blob: ebc361204255a343aa2a7f408acfab03269f654c [file] [log] [blame]
/* AUTORIGHTS
Copyright (C) 2007 Princeton University
This file is part of Ferret Toolkit.
Ferret Toolkit is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
/* the raw data table */
#include <cass.h>
static int raw_init_private(cass_table_t *table, const char *param)
{
int ret;
struct raw_private *priv = type_calloc(struct raw_private, 1);
if (priv == NULL) return CASS_ERR_OUTOFMEM;
ret = cass_dataset_init(&priv->dataset, table->cfg->vec_size, table->cfg->vec_dim, CASS_DATASET_BOTH);
if (ret != 0) { free(priv); priv = NULL; }
table->__private = priv;
return ret;
}
int raw_dump(cass_table_t *table)
{
int ret;
FILE *fout;
struct raw_private *priv = (struct raw_private *)table->__private;
assert(table->loaded);
fout = fopen(table->filename, "w");
if (fout == NULL) return CASS_ERR_IO;
ret = cass_dataset_dump(&priv->dataset, fout);
fclose(fout);
if (ret == 0) table->dirty = 0;
return ret;
}
static int raw_checkpoint_private(cass_table_t *table, CASS_FILE *out)
{
int ret;
struct raw_private *priv = (struct raw_private *)table->__private;
if (table->loaded && table->dirty)
{
ret = raw_dump(table);
if (ret != 0) return ret;
}
return cass_dataset_checkpoint(&priv->dataset, out);
}
static int raw_restore_private(cass_table_t *table, CASS_FILE *in)
{
int ret;
struct raw_private *priv = type_calloc(struct raw_private, 1);
if (priv == NULL) return CASS_ERR_OUTOFMEM;
ret = cass_dataset_restore(&priv->dataset, in);
if (ret != 0)
{
free(priv);
priv = NULL;
}
table->__private = priv;
return ret;
}
int raw_load(cass_table_t *table)
{
int err;
FILE *fin;
struct raw_private *priv = (struct raw_private *)table->__private;
assert(!table->loaded);
if (priv->dataset.num_vecset == 0)
{
priv->dataset.loaded = 1;
return 0;
}
fin = fopen(table->filename, "r");
if (fin == NULL) return CASS_ERR_IO;
err = cass_dataset_load(&priv->dataset, fin, table->cfg->vec_type);
fclose(fin);
return err;
}
int raw_release(cass_table_t *table)
{
int err = 0;
struct raw_private *priv = (struct raw_private *)table->__private;
if (table->loaded && table->dirty) err = raw_dump(table);
if (err != 0) return err;
return cass_dataset_release(&priv->dataset);
}
static int raw_batch_insert(cass_table_t *table, cass_dataset_t * parent, cass_vecset_id_t start, cass_vecset_id_t end)
{
struct raw_private *priv = (struct raw_private *)table->__private;
return cass_dataset_merge(&priv->dataset, parent, start, end - start + 1, NULL, NULL);
}
#define MAX_PROB 100
static int raw_query(cass_table_t *table, cass_query_t *query, cass_result_t *result)
{
struct raw_private *priv = (struct raw_private *)table->__private;
cass_dataset_t *ds = &priv->dataset;
cass_vec_dist_t *vec_dist;
cass_vecset_dist_t *vecset_dist;
int r_threshold = param_get_int(query->extra_params, "-R", MAX_PROB);
cass_size_t orig_size;
cass_id_t i;
assert((query->flags & CASS_RESULT_BITMAPS) == 0);
assert((query->flags & CASS_RESULT_LISTS) == 0);
vec_dist = cass_reg_get(&table->env->vec_dist, query->vec_dist_id);
assert(vec_dist != NULL);
vecset_dist = cass_reg_get(&table->env->vecset_dist, query->vecset_dist_id);
assert(vecset_dist != NULL);
result->flags = CASS_RESULT_LIST | CASS_RESULT_DIST;
if (query->flags & CASS_RESULT_USERMEM)
{
if ((query->flags & CASS_RESULT_BITMAP) ||
((query->flags & CASS_RESULT_LIST) == 0)) debug("User allocated bitmap not supported.\n");
orig_size = result->u.list.size;
if (query->topk > result->u.list.size)
{
ARRAY_EXPAND(result->u.list, query->topk);
}
}
else
{
orig_size = 0;
if (query->topk > 0)
{
ARRAY_INIT_SIZE(result->u.list, query->topk);
}
else
{
ARRAY_INIT(result->u.list);
}
}
if (query->topk > 0)
{
assert(result->u.list.size >= query->topk);
/* set the result array length */
result->u.list.len = query->topk;
TOPK_INIT(result->u.list.data, dist, query->topk, CASS_DIST_MAX);
if (query->candidate != NULL)
{
if (query->candidate->flags & CASS_RESULT_LIST)
{
ARRAY_BEGIN_FOREACH(query->candidate->u.list, cass_list_entry_t cand)
{
cass_list_entry_t entry;
if (cand.id == CASS_ID_MAX) continue;
entry.id = cand.id;
entry.dist = vecset_dist->__class->dist(ds, entry.id, query->dataset, query->vecset_id, vec_dist, vecset_dist);
TOPK_INSERT_MIN(result->u.list.data, dist, query->topk, entry);
} ARRAY_END_FOREACH;
}
else
if (query->candidate->flags & CASS_RESULT_BITMAP)
{
uint32_t id = bitmap_get_size(&query->candidate->u.bitmap);
for (i = 0; i < bitmap_get_count(&query->candidate->u.bitmap); i++)
{
id = bitmap_getNext(&query->candidate->u.bitmap, id);
cass_list_entry_t entry;
entry.id = id;
entry.dist = vecset_dist->__class->dist(ds, entry.id, query->dataset, query->vecset_id, vec_dist, vecset_dist);
TOPK_INSERT_MIN(result->u.list.data, dist, query->topk, entry);
}
}
else assert(0);
}
else
{
for (i = 0; i < ds->num_vecset; i++)
{
if (rand() % MAX_PROB > r_threshold) continue;
cass_list_entry_t entry;
entry.dist = vecset_dist->__class->dist(ds, i, query->dataset, query->vecset_id, vec_dist, vecset_dist);
entry.id = i;
TOPK_INSERT_MIN(result->u.list.data, dist, query->topk, entry);
}
}
if (query->flags & CASS_RESULT_SORT)
{
TOPK_SORT_MIN(result->u.list.data, cass_list_entry_t, dist, query->topk);
result->flags |= CASS_RESULT_SORT;
}
}
else
{
ARRAY_TRUNC(result->u.list);
if (query->candidate != NULL)
{
if (query->candidate->flags & CASS_RESULT_LIST)
for (i = 0; i < query->candidate->u.list.len; i++)
{
cass_list_entry_t entry;
entry.id = query->candidate->u.list.data[i].id;
if (entry.id == CASS_ID_MAX) continue;
entry.dist = vecset_dist->__class->dist(ds, entry.id, query->dataset, query->vecset_id, vec_dist, vecset_dist);
if (entry.dist < query->range)
{
ARRAY_APPEND(result->u.list, entry);
}
}
else if (query->candidate->flags & CASS_RESULT_BITMAP)
{
uint32_t id = bitmap_get_size(&query->candidate->u.bitmap);
for (i = 0; i < bitmap_get_count(&query->candidate->u.bitmap); i++)
{
id = bitmap_getNext(&query->candidate->u.bitmap, id);
cass_list_entry_t entry;
entry.id = id;
entry.dist = vecset_dist->__class->dist(ds, entry.id, query->dataset, query->vecset_id, vec_dist, vecset_dist);
if (entry.dist < query->range)
{
ARRAY_APPEND(result->u.list, entry);
}
}
}
}
else
{
for (i = 0; i < ds->num_vecset; i++)
{
if (rand() % MAX_PROB > r_threshold) continue;
cass_list_entry_t entry;
entry.dist = vecset_dist->__class->dist(ds, i, query->dataset, query->vecset_id, vec_dist, vecset_dist);
entry.id = i;
if (entry.dist < query->range)
{
ARRAY_APPEND(result->u.list, entry);
}
}
}
}
if (orig_size == 0) result->flags |= CASS_RESULT_MALLOC;
else if (result->u.list.size > orig_size) result->flags |= CASS_RESULT_REALLOC;
return 0;
}
/* this function is used for multiple oprs, and is thus not static */
/*
int raw_drop(cass_table_t *table)
{
char buf[BUFSIZ];
mkpath(buf, table->env->base_dir, table->name, table->opr->name);
return unlink(buf);
}
*/
static int raw_free_private(cass_table_t *table)
{
if (table->loaded) raw_release(table);
free(table->__private);
return 0;
}
static char *raw_tune(cass_table_t *table, char *extra_input)
{
// struct raw_private *priv = (struct raw_private *)table->__private;
return NULL;
}
cass_table_opr_t opr_raw = {
.name = "raw",
.type = CASS_DATA | CASS_SEQUENTIAL,
.vecset_type = CASS_VECSET_SET,
.vec_type = CASS_ANY,
.dist_vecset = CASS_ANY,
.dist_vec = CASS_ANY,
.tune = raw_tune,
.init_private = raw_init_private,
.batch_insert = raw_batch_insert,
.query = raw_query,
.batch_query = NULL,
.load = raw_load,
.release = raw_release,
.checkpoint_private = raw_checkpoint_private,
.restore_private = raw_restore_private,
// .drop = raw_drop,
.free_private = raw_free_private
};