blob: ddb5c7f167e7d724ce5b14077acab2c85dfd5350 [file] [log] [blame]
/* AUTORIGHTS
Copyright (C) 2007 Princeton University
This file is part of Ferret Toolkit.
Ferret Toolkit is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <cass.h>
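/* file format
*
* LINE 1: # <# vecset> <vecset type: 0 | 1 | 2>
* LINE 2: # <type : int | float | bit> <dim>
* (lines 1 and 2 are written by the exporter as '#' comment lines; the
* importers skip lines that start with '#' and do not parse them)
* LINE 3: <id 1> <# vec>
* LINE 4: <weight> <vec 1>
* ...
* <id 2> <# vec>
* <weight> <vec 1>
* ...
*
* <id> is an object name for cass_table_import_data and a numeric vecset
* id for cass_table_import_direct_data.
*/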
#warning to be extended to allow external id
/*
 * Skip consecutive '#' comment lines at the current position of `fin`.
 *
 * While the next character is '#', everything up to and including the end
 * of that line is discarded.  The first character that does not start a
 * comment is pushed back so the caller reads it next.  Leading whitespace
 * is not skipped: '#' is only recognised as the very next character.
 */
static void eat_comment (FILE *fin)
{
    int c;

    for (;;)
    {
        c = fgetc(fin);
        if (c != '#') break;
        /* Discard the rest of the comment line.  EOF must end the scan too,
         * otherwise a last comment line without a trailing newline would
         * loop forever. */
        do
        {
            c = fgetc(fin);
        } while (c != '\n' && c != EOF);
        if (c == EOF) return;
    }
    /* Nothing to push back once the stream is exhausted. */
    if (c != EOF) ungetc(c, fin);
}
/*
 * Read `dim` whitespace-separated decimal integers from `fin` into `vec`.
 *
 * Returns 0 when all values were read, or CASS_ERR_IO as soon as one value
 * cannot be parsed (elements before the failing one are already stored).
 */
static int _import_vec_int (int32_t *vec, cass_size_t dim, FILE *fin)
{
    cass_size_t i;
    int value;

    for (i = 0; i < dim; i++)
    {
        /* "%d" requires an int *; read through a plain int so the call is
         * well-defined even on targets where int32_t is not int. */
        if (fscanf(fin, "%d", &value) != 1) return CASS_ERR_IO;
        vec[i] = (int32_t)value;
    }
    return 0;
}
/*
 * Parse `dim` floating-point values from `fin` into `vec`, in order.
 *
 * Returns 0 once every value has been read, or CASS_ERR_IO at the first
 * value that fscanf cannot convert.
 */
static int _import_vec_float (float *vec, cass_size_t dim, FILE *fin)
{
    float *slot = vec;
    float *const end = vec + dim;

    while (slot != end)
    {
        if (fscanf(fin, "%f", slot) != 1)
        {
            return CASS_ERR_IO;
        }
        slot++;
    }
    return 0;
}
/*
 * Read `dim` bits, written as a contiguous run of '0'/'1' characters, from
 * `fin` and pack them into `vec`, CHUNK_BIT bits per chunk.
 *
 * Every character before the first '0' or '1' is skipped.  Within a chunk
 * each new bit is shifted in from the right, so the first bit read ends up
 * in the highest of the positions filled for that chunk.
 *
 * When `dim` is not a multiple of CHUNK_BIT the remaining bits are stored
 * in one extra trailing chunk (low bits), with the unused high bits zero.
 * This assumes the vector storage holds that extra chunk, which
 * _export_vec_bit already relies on when it reads it back -- TODO confirm
 * against the vector size computation.
 *
 * NOTE(review): _export_vec_bit emits the least significant bit of a chunk
 * first, so an export followed by an import reverses the bit order inside
 * each chunk.  Confirm which order is intended before changing either side.
 *
 * Returns 0 on success, CASS_ERR_IO on EOF before the first bit or when a
 * character other than '0'/'1' appears inside the run of `dim` bits.
 */
static int _import_vec_bit (chunk_t *vec, cass_size_t dim, FILE *fin)
{
    int c;
    cass_size_t i, j, k;
    chunk_t chunk = 0;

    /* Skip separators/whitespace preceding the bit string. */
    do
    {
        c = fgetc(fin);
        if (c == EOF) return CASS_ERR_IO;
    } while (c != '0' && c != '1');
    ungetc(c, fin);

    j = k = 0;
    for (i = 0; i < dim; i++)
    {
        c = fgetc(fin);
        if (c != '0' && c != '1') return CASS_ERR_IO;
        chunk = chunk << 1;
        chunk |= c - '0';
        j++;
        if (j == CHUNK_BIT)
        {
            vec[k++] = chunk;
            /* Start the next chunk clean so no stale bits leak into it. */
            chunk = 0;
            j = 0;
        }
    }
    /* Flush the trailing partial chunk; it used to be dropped silently. */
    if (j > 0) vec[k] = chunk;
    return 0;
}
/*
 * Write the `dim` integers of `vec` to `fout` as one tab-separated line
 * terminated by a newline.
 *
 * Returns 0 on success, or CASS_ERR_IO as soon as writing a value fails.
 */
static int _export_vec_int (int32_t *vec, cass_size_t dim, FILE *fout)
{
    cass_size_t i;

    for (i = 0; i < dim; i++)
    {
        if (i > 0) fprintf(fout, "\t");
        /* "%d" takes an int; convert explicitly so the argument matches the
         * format even on targets where int32_t is not int. */
        if (fprintf(fout, "%d", (int)vec[i]) <= 0) return CASS_ERR_IO;
    }
    fprintf(fout, "\n");
    return 0;
}
/*
 * Emit the `dim` floats of `vec` on a single line of `fout`: values are
 * printed with "%f", separated by tabs, and followed by a newline.
 *
 * Returns 0 on success, or CASS_ERR_IO when printing a value fails.
 */
static int _export_vec_float (float *vec, cass_size_t dim, FILE *fout)
{
    cass_size_t idx = 0;

    while (idx < dim)
    {
        /* A separator goes before every value except the first. */
        if (idx != 0)
        {
            fprintf(fout, "\t");
        }
        if (fprintf(fout, "%f", vec[idx]) <= 0)
        {
            return CASS_ERR_IO;
        }
        idx++;
    }
    fprintf(fout, "\n");
    return 0;
}
/*
 * Write the first `dim` bits of `vec` to `fout` as a run of '0'/'1'
 * characters followed by a newline.
 *
 * Chunks are consumed in order, CHUNK_BIT bits each, and within a chunk
 * the least significant bit is written first.
 *
 * NOTE(review): _import_vec_bit shifts bits in from the right, so the bit
 * order inside a chunk is reversed across an export/import round trip.
 * Confirm which order is intended before changing either side.
 *
 * Returns 0 on success, or CASS_ERR_IO when writing a bit fails (matching
 * the int/float exporters).
 */
static int _export_vec_bit (chunk_t *vec, cass_size_t dim, FILE *fout)
{
    int c;
    cass_size_t i, j, k;
    chunk_t chunk = 0;

    j = k = 0;
    for (i = 0; i < dim; i++)
    {
        /* Load a chunk only when a bit from it is actually needed; loading
         * eagerly after each full chunk read one element past the data when
         * dim is a multiple of CHUNK_BIT (and vec[0] when dim is 0). */
        if (j == 0) chunk = vec[k++];
        c = chunk & 1;
        chunk = chunk >> 1;
        if (fprintf(fout, "%c", '0'+c) <= 0) return CASS_ERR_IO;
        j++;
        if (j == CHUNK_BIT) j = 0;
    }
    fprintf(fout, "\n");
    return 0;
}
/*
 * Import named vector sets from the text file `fname` into the raw dataset
 * of `table`, then pass the newly added range of vector sets to every child
 * table through cass_table_batch_insert.
 *
 * Each record is a header "<object name> <number of vectors>" followed by
 * that many "<weight> <vector>" entries; '#' comment lines are skipped
 * before each header and each vector.  The object name is looked up in
 * table->map and inserted there when the lookup fails.
 *
 * Reading stops at the first position where a header cannot be parsed; that
 * is treated as end of input, not as an error.
 *
 * Returns 0 on success; CASS_ERR_IO when the file cannot be opened, a vector
 * is malformed, or a record has more than one vector while the table is
 * configured as CASS_VECSET_SINGLE; CASS_ERR_OUTOFMEM when a buffer cannot be
 * grown; otherwise the error from cass_map_insert or cass_table_batch_insert.
 * Records imported before a failure remain in the dataset.
 */
int cass_table_import_data(cass_table_t *table, char *fname)
{
    int ret;
    FILE *fin;
    cass_size_t dim;
    char objname[BUFSIZ];
    char header_fmt[32];
    uint32_t j, k, l;
    cass_dataset_t *ds;
    cass_vec_t *vec;
    cass_vecset_id_t vecset_start;
    size_t new_max;
    void *grown;

    assert(table->opr->type & CASS_DATA);
    ds = &((struct raw_private *)table->__private)->dataset;
    vecset_start = ds->num_vecset;
    assert(table->loaded);

    /* Build "%<BUFSIZ-1>s%u" so an over-long name in the file cannot
     * overflow objname. */
    snprintf(header_fmt, sizeof header_fmt, "%%%ds%%u", (int)(BUFSIZ - 1));

    fin = fopen(fname, "r");
    if (fin == NULL) return CASS_ERR_IO;
    ret = CASS_ERR_IO;
    dim = table->cfg->vec_dim;
    for (;;)
    {
        eat_comment(fin);
        if (fscanf(fin, header_fmt, objname, &l) != 2) break;
        ret = cass_map_dataobj_to_id(table->map, objname, &k);
        if (ret < 0)
        {
            /* new obj. */
            ret = cass_map_insert(table->map, &k, objname);
            if (ret < 0) goto err;
        }
        if (ds->num_vecset < k) ds->num_vecset = k;
        if (ds->num_vecset + 1 > ds->max_vecset)
        {
            /* Grow through a temporary so the old array and its recorded
             * capacity stay valid when realloc fails. */
            new_max = grow(ds->max_vecset, ds->num_vecset + 1);
            grown = realloc(ds->vecset, sizeof (cass_vecset_t) * new_max);
            if (grown == NULL)
            {
                ret = CASS_ERR_OUTOFMEM;
                goto err;
            }
            ds->vecset = (cass_vecset_t *)grown;
            ds->max_vecset = new_max;
        }
        ret = CASS_ERR_IO;
        ds->vecset[k].num_regions = l;
        if (table->cfg->vecset_type == CASS_VECSET_SINGLE && l != 1) goto err;
        ds->vecset[k].start_vecid = ds->num_vec;
        if (ds->num_vec + l > ds->max_vec)
        {
            new_max = grow(ds->max_vec, ds->num_vec + l);
            grown = realloc(ds->vec, ds->vec_size * new_max);
            if (grown == NULL)
            {
                ret = CASS_ERR_OUTOFMEM;
                goto err;
            }
            ds->vec = (cass_vec_t *)grown;
            ds->max_vec = new_max;
        }
        /* Vectors are ds->vec_size bytes apart, so step in bytes. */
        vec = (cass_vec_t *)((char *)ds->vec + (ds->num_vec * ds->vec_size));
        for (j = 0; j < l; j++)
        {
            ret = CASS_ERR_IO;
            /* NOTE(review): parent is ds->num_vecset, not k; the two differ
             * when the map returned an id below ds->num_vecset -- confirm. */
            vec->parent = ds->num_vecset;
            eat_comment(fin);
            if (fscanf(fin, "%f", &vec->weight) != 1) goto err;
            switch (table->cfg->vec_type)
            {
            case CASS_VEC_INT: ret = _import_vec_int(vec->u.int_data, dim, fin); break;
            case CASS_VEC_FLOAT: ret = _import_vec_float(vec->u.float_data, dim, fin); break;
            case CASS_VEC_BIT: ret = _import_vec_bit(vec->u.bit_data, dim, fin); break;
            default: assert(0);
            }
            if (ret != 0) goto err;
            vec = (cass_vec_t *)((char *)vec + ds->vec_size);
            ds->num_vec++;
        }
        ds->num_vecset++;
    }
    table->dirty = 1;
    fclose(fin);
    fin = NULL;
    /* cascade insert */
    ARRAY_BEGIN_FOREACH(table->children, cass_table_t *p)
    {
        ret = cass_table_batch_insert(p, ds, vecset_start, ds->num_vecset -1);
        if (ret != 0) goto err;
    } ARRAY_END_FOREACH;
    return 0;
err:
    if (fin != NULL) fclose(fin);
    return ret;
}
/*
 * Import vector sets identified by numeric id from the text file `fname`
 * into the raw dataset of `table`, then pass the newly added range of
 * vector sets to every child table through cass_table_batch_insert.
 *
 * Each record is a header "<vecset id> <number of vectors>" followed by that
 * many "<weight> <vector>" entries; '#' comment lines are skipped before
 * each header and each vector.  The id in the file must equal the next free
 * vecset index (ds->num_vecset), i.e. records must appear in sequence.
 *
 * Reading stops at the first position where a header cannot be parsed; that
 * is treated as end of input, not as an error.
 *
 * Returns 0 on success; CASS_ERR_IO when the file cannot be opened, an id is
 * out of sequence, a vector is malformed, or a record has more than one
 * vector while the table is configured as CASS_VECSET_SINGLE;
 * CASS_ERR_OUTOFMEM when a buffer cannot be grown; otherwise the error from
 * cass_table_batch_insert.  Records imported before a failure remain in the
 * dataset.
 */
int cass_table_import_direct_data(cass_table_t *table, char *fname)
{
    int ret;
    FILE *fin;
    cass_size_t dim;
    uint32_t j, k, l;
    cass_dataset_t *ds;
    cass_vec_t *vec;
    cass_vecset_id_t vecset_start;
    size_t new_max;
    void *grown;

    assert(table->opr->type & CASS_DATA);
    ds = &((struct raw_private *)table->__private)->dataset;
    vecset_start = ds->num_vecset;
    assert(table->loaded);
    fin = fopen(fname, "r");
    if (fin == NULL) return CASS_ERR_IO;
    ret = CASS_ERR_IO;
    dim = table->cfg->vec_dim;
    for (;;)
    {
        /* Make room for one more vecset before reading its header. */
        if (ds->num_vecset + 1 > ds->max_vecset)
        {
            /* Grow through a temporary so the old array and its recorded
             * capacity stay valid when realloc fails. */
            new_max = grow(ds->max_vecset, ds->num_vecset + 1);
            grown = realloc(ds->vecset, sizeof (cass_vecset_t) * new_max);
            if (grown == NULL)
            {
                ret = CASS_ERR_OUTOFMEM;
                goto err;
            }
            ds->vecset = (cass_vecset_t *)grown;
            ds->max_vecset = new_max;
        }
        ret = CASS_ERR_IO;
        eat_comment(fin);
        if (fscanf(fin, "%u%u", &k, &l) != 2) break;
        if (k != ds->num_vecset) goto err;
        ds->vecset[k].num_regions = l;
        if (table->cfg->vecset_type == CASS_VECSET_SINGLE && l != 1) goto err;
        ds->vecset[k].start_vecid = ds->num_vec;
        if (ds->num_vec + l > ds->max_vec)
        {
            new_max = grow(ds->max_vec, ds->num_vec + l);
            grown = realloc(ds->vec, ds->vec_size * new_max);
            if (grown == NULL)
            {
                ret = CASS_ERR_OUTOFMEM;
                goto err;
            }
            ds->vec = (cass_vec_t *)grown;
            ds->max_vec = new_max;
        }
        /* Vectors are ds->vec_size bytes apart, so step in bytes. */
        vec = (cass_vec_t *)((char *)ds->vec + (ds->num_vec * ds->vec_size));
        for (j = 0; j < l; j++)
        {
            ret = CASS_ERR_IO;
            vec->parent = ds->num_vecset;
            eat_comment(fin);
            if (fscanf(fin, "%f", &vec->weight) != 1) goto err;
            switch (table->cfg->vec_type)
            {
            case CASS_VEC_INT: ret = _import_vec_int(vec->u.int_data, dim, fin); break;
            case CASS_VEC_FLOAT: ret = _import_vec_float(vec->u.float_data, dim, fin); break;
            case CASS_VEC_BIT: ret = _import_vec_bit(vec->u.bit_data, dim, fin); break;
            default: assert(0);
            }
            if (ret != 0) goto err;
            vec = (cass_vec_t *)((char *)vec + ds->vec_size);
            ds->num_vec++;
        }
        ds->num_vecset++;
    }
    table->dirty = 1;
    fclose(fin);
    fin = NULL;
    /* cascade insert */
    ARRAY_BEGIN_FOREACH(table->children, cass_table_t *p)
    {
        ret = cass_table_batch_insert(p, ds, vecset_start, ds->num_vecset -1);
        if (ret != 0) goto err;
    } ARRAY_END_FOREACH;
    return 0;
err:
    if (fin != NULL) fclose(fin);
    return ret;
}
/*
 * Write the dataset of `table` to the text file `fname`.
 *
 * Two '#' header lines are written first: the number of vector sets with
 * the vecset type (0 = CASS_VECSET_NONE, 1 = CASS_VECSET_SINGLE,
 * 2 = CASS_VECSET_SET), then the element type ("int", "float" or "bit")
 * with the vector dimension.
 *
 * If the dataset has all CASS_DATASET_BOTH flag bits set, each vector set
 * is written as "<object name>\t<number of vectors>" followed by one
 * "<weight>\t<vector>" line per vector.  Otherwise, if it has
 * CASS_DATASET_VEC, every vector is written as
 * "<weight>\t<parent>\t<vector>" after a '#' column header.  With neither,
 * only the two header lines are written.
 *
 * Returns 0 on success, or CASS_ERR_IO when the file cannot be opened or a
 * write/close error was recorded on the stream.
 */
int cass_table_export_data(cass_table_t *table, char *fname)
{
    FILE *fout;
    cass_dataset_t *ds;
    cass_vec_t *vec;
    uint32_t i, j, k, dim;
    const char *p;
    int ret;

    assert(table->opr->type & CASS_DATA ||
           table->opr->type & CASS_VEC_SKETCH);
    ds = &((struct raw_private *)table->__private)->dataset;
    assert(ds->loaded);
    fout = fopen(fname, "w");
    if (fout == NULL) return CASS_ERR_IO;
    dim = table->cfg->vec_dim;

    /* Give k and p defined values so a build with assertions disabled does
     * not print indeterminate data for an unexpected configuration. */
    k = 0;
    if (table->cfg->vecset_type == CASS_VECSET_NONE) k = 0;
    else if (table->cfg->vecset_type == CASS_VECSET_SINGLE) k = 1;
    else if (table->cfg->vecset_type == CASS_VECSET_SET) k = 2;
    else assert(0);
    fprintf(fout, "# %u\t%u\n", ds->num_vecset, k);

    p = "unknown";
    if (table->cfg->vec_type == CASS_VEC_INT)
    {
        p = "int";
    }
    else if (table->cfg->vec_type == CASS_VEC_FLOAT)
    {
        p = "float";
    }
    else if (table->cfg->vec_type == CASS_VEC_BIT)
    {
        p = "bit";
    }
    else assert(0);
    fprintf(fout, "# %s\t%u\n", p, dim);

    if ((ds->flags & CASS_DATASET_BOTH) == CASS_DATASET_BOTH)
    {
        vec = ds->vec;
        for (i = 0; i < ds->num_vecset; i++)
        {
            char *name;
            /* NOTE(review): the lookup result is not checked; `name` is
             * assumed to be set for every id below num_vecset -- confirm. */
            cass_map_id_to_dataobj(table->map, i, &name);
            fprintf(fout, "%s\t%u\n", name, ds->vecset[i].num_regions);
            k = ds->vecset[i].start_vecid;
            /* Vector sets are expected to be stored back to back. */
            assert((char *)ds->vec + k * ds->vec_size == (char *)vec);
            for (j = 0; j < ds->vecset[i].num_regions; j++)
            {
                fprintf(fout, "%f\t", vec->weight);
                switch (table->cfg->vec_type)
                {
                case CASS_VEC_INT: _export_vec_int(vec->u.int_data, dim, fout); break;
                case CASS_VEC_FLOAT: _export_vec_float(vec->u.float_data, dim, fout); break;
                case CASS_VEC_BIT: _export_vec_bit(vec->u.bit_data, dim, fout); break;
                default: assert(0);
                }
                /* Vectors are ds->vec_size bytes apart, so step in bytes. */
                vec = (cass_vec_t *)((char *)vec + ds->vec_size);
            }
        }
    }
    else if (ds->flags & CASS_DATASET_VEC)
    {
        vec = ds->vec;
        fprintf(fout, "#weight\tparent\tvector\n");
        for (i = 0; i < ds->num_vec; i++)
        {
            fprintf(fout, "%f\t", vec->weight);
            fprintf(fout, "%u\t", vec->parent);
            switch (table->cfg->vec_type)
            {
            case CASS_VEC_INT: _export_vec_int(vec->u.int_data, dim, fout); break;
            case CASS_VEC_FLOAT: _export_vec_float(vec->u.float_data, dim, fout); break;
            case CASS_VEC_BIT: _export_vec_bit(vec->u.bit_data, dim, fout); break;
            default: assert(0);
            }
            vec = (cass_vec_t *)((char *)vec + ds->vec_size);
        }
    }

    /* Report failed writes instead of returning success for a truncated
     * file; fclose can also fail while flushing buffered output. */
    ret = 0;
    if (ferror(fout)) ret = CASS_ERR_IO;
    if (fclose(fout) != 0) ret = CASS_ERR_IO;
    return ret;
}