Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

data.cpp

Go to the documentation of this file.
00001 //****************************************************
00002 //  April, 1993, University of Illinois
00003 // Copyright (C) 1993, 1994 Tianlin Wang
00004 /* Copyright (C) 1994-2003 Matvec Development Team. 
00005 
00006   This program is free software; you can redistribute it and/or
00007   modify it under the terms of the GNU Library General Public
00008   License as published by the Free Software Foundation; either
00009   version 2 of the License, or (at your option) any later version.
00010   
00011   This program is distributed in the hope that it will be useful,
00012   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014   Library General Public License for more details.
00015     
00016   You should have received a copy of the GNU Library General Public
00017   License along with this library; if not, write to the Free
00018   Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
00019   MA 02111-1307, USA 
00020 */
00021 
00022 #include <iostream>
00023 #include <iomanip>
00024 #include <cstdio>
00025 #include <vector>
00026 #include <string>
00027 
00028 #include "session.h"
00029 #include "util.h"
00030 #include "doublematrix.h"
00031 #include "data.h"
00032 
00033 namespace matvec {
00034 
00035 Data::Data(void)
00036 {
00037    numcol     = 0;
00038    maxnumcol  = 0;
00039    numrec     = 0;
00040    new_col    = 1;
00041    datasheet  = 0;
00042    hashtable  = 0;
00043    resize(0,0,20);
00044    tdfname = SESSION.mktemp();
00045 }
00046 
00047 Data::Data(Data& D)
00048 {
00049    numcol     = 0;
00050    maxnumcol  = 0;
00051    numrec     = 0;
00052    datasheet  = 0;
00053    hashtable  = 0;
00054    copyfrom(D);
00055 }
00056 
00057 const Data& Data::operator=(Data& A)
00058 {
00059    copyfrom(A);
00060    return *this;
00061 }
00062 
00063 const Data& Data::operator=(const Field& V)
00064 {
00065    if (new_col == 1) {  // first column intercept can't be overwritten
00066       resize(V.len(),numcol,20);
00067    }
00068    else {
00069       if (V.len() != numrec) {
00070          warning("Data = Col: size incompatible");
00071          return *this;
00072       }
00073    }
00074    if (!data_in_memory) input_datasheet();
00075    std::string cname = datasheet[new_col].name();
00076    int indx = datasheet[new_col].index();
00077    datasheet[new_col] = V;
00078    datasheet[new_col].name(cname);
00079    datasheet[new_col].index(indx);
00080    if (V.hashtable) hashtable[new_col]->copyfrom( *(V.hashtable));
00081    save_datasheet(0);     // save the changes on disk, but keep them in memory
00082    return *this;
00083 }
00084 
00085 void Data::copyfrom(Data& A)
00086 {
00087    if (this == &A) return;
00088    if (A.data_on_disk == 0) A.save_datasheet();
00089    resize(A.numrec,A.numcol,A.maxnumcol);
00090    new_col = A.new_col;
00091    for (unsigned i=0; i<numcol; i++) hashtable[i]->copyfrom(*(A.hashtable[i]));
00092    tdfname = A.tdfname;
00093    data_on_disk   = 1;
00094    data_in_memory = 0;
00095    input_datasheet();
00096    save_datasheet(0);              // save changes
00097 }
00098 
00099 Data& Data::resize(const unsigned nr,const unsigned nc,const unsigned mc)
00100 {
00101    if (numrec == nr && numcol == nc && maxnumcol== mc) return *this;
00102    release();
00103    numrec    = nr;
00104    numcol    = nc;
00105    if (mc < nc) {
00106       maxnumcol = nc + 10;          // 10 is the buffer columns
00107    }
00108    else {
00109       maxnumcol = mc + 1;           // first column is reserved for intercept
00110    }
00111    if (numrec==0) numcol = 0;
00112    if (numcol==0) numrec = 0;
00113 
00114    data_in_memory = 1;
00115    data_on_disk = 0;
00116    hashtable = new HashTable *[maxnumcol];
00117    check_ptr(hashtable);
00118    unsigned i;
00119    for (i=0; i<maxnumcol; i++) {
00120       hashtable[i] = new HashTable;
00121       check_ptr(hashtable[i]);
00122    }
00123    datasheet = new Field [maxnumcol];
00124    check_ptr(datasheet);
00125    datasheet[0].resize(0);         // first column is reserved for intercept
00126    for (i=1; i<numcol; i++) datasheet[i].resize(numrec);
00127    return *this;
00128 }
00129 
00130 int Data::field_index(const std::string &colname) const
00131 {
00132    for (unsigned i=0; i<numcol; i++) {
00133       if (datasheet[i].name() == colname) {
00134          return datasheet[i].index();
00135       }
00136    }
00137    return -1;
00138 }
00139 
00140 void Data::field_index_vec(Vector<int> &ivec,const std::string &fdname)
00141 {
00142    if (numcol<1) {   // first column is a reserved: intercept
00143       return;
00144    }
00145    unsigned i,nc;
00146    if (fdname == "") {
00147       nc = numcol-1;
00148       ivec.reserve(nc);
00149       for (i=0; i<nc; i++) ivec[i] = i+1;   // don't print intercept
00150    }
00151    else {
00152       int k,nskip,j;
00153       std::string sep(" ,");
00154       std::string fmt(fdname);
00155       std::vector<std::string> tmpvec;
00156       nc = split(fmt,sep,&tmpvec);  //////// fmt.split(n,sep);  field name can be any length
00157       Vector<int> tmpivec(nc);
00158       for (nskip=0,i=0; i<nc; i++) {
00159          k = field_index(tmpvec[i]);
00160          if (k<0) {
00161             warning("Data::field_index_vec(): %s: unknown, it's skipped",tmpvec[i].c_str());
00162             nskip++;
00163          }
00164          tmpivec[i] = k;
00165       }
00166 
00167       ivec.reserve(nc - nskip);
00168       for (j=0,i=0; i<nc; i++) {
00169          if (tmpivec[i] >= 0) ivec[j++] = tmpivec[i];
00170       }
00171    }
00172    return;
00173 }
00174 
00175 void Data::input(const std::string &fname,const std::string &recfmt)
00176 {
00177    size_t linewidth = 1024;
00178    char *line = new char [linewidth];
00179    int k;
00180    unsigned i,j,nc,nr,id;
00181    if (recfmt ==  "") {
00182       warning("Data::input(): no column-name specified");
00183       return;
00184    }
00185    std::string tmpstr;
00186    tmpstr = recfmt;
00187    i = 0;
00188    while (tmpstr[i] == ' ') {i++;}  // find first nonspace char
00189    if (tmpstr[i] == '$') throw exception("Data::input(): $ is misplaced");
00190    i = 0;
00191    while (tmpstr[i]) {             // move $ to the end of each token
00192       if (tmpstr[i] == '$' ) {
00193          tmpstr[i] = ' ';
00194          j = i;
00195          while (tmpstr[--j] == ' ');
00196          tmpstr[++j] = '$';
00197       }
00198       i++;
00199    }
00200    std::string fmt = "intercept ";      // first column is reserved for intercept
00201    fmt.append(tmpstr);
00202 
00203    std::string sep(" ,");
00204    std::vector<std::string> tmpvec;
00205    unsigned tncol = split(fmt,sep,&tmpvec); ///// split(tncol,sep);  tncol >= 1 is required
00206    nc = tncol;
00207    for (i=0; i<tncol; ++i) if (tmpvec[i] == "_skip") nc--;
00208    std::ifstream  in(fname.c_str(),std::ios::in);
00209    if (!in) {
00210      if(line){
00211       delete [] line;
00212       line=0;
00213      }
00214       throw exception("Data::input(): cannot open file");
00215    }
00216    if (!in.getline(line,linewidth)) {
00217       warning("Data::input(): empty datafile: %s",fname.c_str());
00218       if(line){
00219         delete [] line;
00220         line=0;
00221       }
00222       return;
00223    }
00224    while (!validline(line)) {
00225       if (!in.getline(line,linewidth)) {
00226          warning("Data::input(): no real data in datafile: %s",fname.c_str());
00227          if(line){
00228            delete [] line;
00229            line=0;}
00230          return;
00231       }
00232    }
00233    std::string T(line);
00234    i = split(T," ");
00235    if (i < tncol-1) {
00236      if(line){
00237        delete [] line;
00238        line=0;
00239      }
00240       throw exception("Data::input(): the # of columns in data < the expected");
00241       return;
00242    }
00243    in.clear();
00244    in.seekg(0,std::ios::beg);
00245    nr = 0;
00246    while (in.getline(line,linewidth)) if (validline(line)) nr++;
00247    resize(nr,nc);
00248    int ThereareStrcol = 0;
00249    Vector<int> intvec(tncol);
00250    std::string tstr;
00251    for (i=0; i<tncol; i++) {
00252       tstr = tmpvec[i];
00253       if (tstr.find("_skip") >= 0) {
00254          for (k=i+1; k<tncol; k++) {
00255             if (tstr == tmpvec[k]) {
00256               if(line){
00257                 delete [] line;
00258                 line=0;
00259               }
00260                throw exception("Data::input(): duplicated column names");
00261             }
00262          }
00263       }
00264    }
00265    std::string::size_type begidx;
00266    for (k=0,i=0; i<tncol; i++) {
00267       if (tmpvec[i] == "_skip") {
00268          intvec[i] = -1;
00269       }
00270       else {
00271          intvec[i] = k;
00272          begidx = tmpvec[i].find("$");
00273          if (begidx != std::string::npos) {
00274             tmpvec[i].replace(begidx,1,"");
00275             datasheet[k].type('S');      // string column
00276             ThereareStrcol = 1;
00277             hashtable[k]->resize(numrec);
00278          }
00279          datasheet[k].name(tmpvec[i]);
00280          datasheet[k].index(k);
00281          k++;
00282       }
00283    }      // k == numcol-1
00284    char *token;
00285    std::fstream tdatfile(tdfname.c_str(),std::ios::out);
00286 
00287    if (!tdatfile) {
00288      if(line){
00289       delete [] line;
00290       line=0;
00291      }
00292       throw exception("Data::input(): cannot open file");
00293    }
00294    DataNode* dat_cell;
00295    double x;
00296    char *endpt;
00297    j = 0;
00298 
00299    in.clear();
00300    in.seekg(0L,std::ios::beg);                // rewind data file
00301    while (in.getline(line,linewidth)) {
00302       if (validline(line)) {
00303          token = strtok(line,", ");
00304          i = 1;
00305          while (token) {
00306             if (i >= tncol) break;
00307             k = intvec[i++];
00308             if (k > 0) {
00309                dat_cell = &datasheet[k][j];
00310                if (strcmp(token,".")) {
00311                   dat_cell->missing = 0;
00312                   if (datasheet[k].type() == 'S') {
00313                      hashtable[k]->insert(token);
00314                      id = strlen(token)+1;
00315                      tdatfile.write((char *)&id,sizeof(unsigned));
00316                      tdatfile.write(token,id);
00317                   }
00318                   else {
00319                      x = strtod(token,&endpt);   // sscanf(token,"%lf",&x);
00320                      if (*endpt == '\0') {
00321                         dat_cell->double_val(x);
00322                      }
00323                      else {
00324                         warning("Data::input(): numeric column has non-numerics "
00325                               "at the corner of row %d and column %d.\n"
00326                               "  SUGGESTION: claim it as string column in"
00327                               " D.input() with $ sign",
00328                               j+1,i-1);
00329                         resize(0,0);
00330                         in.close();
00331                         tdatfile.close();
00332                         if(line){
00333                         delete [] line;
00334                         line=0;
00335                         }
00336                         return;
00337                      }
00338                   }
00339                }
00340                else {
00341                   dat_cell->missing = 1;
00342                   datasheet[k].count_miss(1);
00343                }
00344             }
00345             token = strtok('\0',", ");
00346          }
00347          j++;
00348       }  // end of validline(line)
00349    }
00350    in.close();
00351    tdatfile.close();
00352    datasheet[0].type('I');    // I = type for intercept
00353    datasheet[0].nlevel(1);    // I = type for intercept
00354    datasheet[0].nmiss(0);
00355 
00356    /////////////////////////////////////////////////////
00357    //   now re-hash for each string field, if necessary
00358    ////////////////////////////////////////////////////
00359    if (ThereareStrcol) {
00360       for (i=1; i<numcol; i++) {
00361          if (datasheet[i].type() != 'S') continue;
00362          id = hashtable[i]->size();
00363          hashtable[i]->resize(id);
00364          datasheet[i].nlevel(id);
00365       }
00366       tdatfile.open(tdfname.c_str(),std::ios::in);
00367       for (i=0; i<numrec; i++) {
00368          for (j=1; j<numcol; j++) {
00369             dat_cell = &datasheet[j][i];
00370             if (datasheet[j].type() == 'S' && !(dat_cell->missing)) {
00371                tdatfile.read((char *)&id,sizeof(unsigned));
00372                tdatfile.read(line,id);
00373                id = hashtable[j]->insert(line);
00374                dat_cell->unsigned_val(id);
00375             }
00376          }
00377       }
00378       tdatfile.close();
00379    }
00380    if(line){
00381      delete [] line;
00382      line=0;
00383    }
00384    ////////////////////////////////////////////////////////////////////
00385    // save a copy of data is a must. Because data could be changed
00386    // temporarily for some special purposes, the change can be droped
00387    // by release datasheet
00388    ////////////////////////////////////////////////////////////////////
00389    save_datasheet(0);    // save a copy to hard-disk
00390 }
00391 
00392 /*
00393 void Data::drop(const char *fdname)
00394 {
00395    int nc=0;
00396    int *intvec = field_index_vec(nc,fdname);
00397    for (int i=0; i<nc; i++) {
00398 
00399    }
00400    if (intvec) delete [] intvec;
00401 
00402 }
00403 
00404 void Data::keep(const char *fdname)
00405 {
00406    int nc=0;
00407    int *intvec = field_index_vec(nc,fdname);
00408    for (int i=0; i<nc; i++) {
00409 
00410    }
00411    if (intvec) delete [] intvec;
00412 }
00413 */
00414 
00415 void Data::value_for_missing(const double vm)
00416 {
00417    if (!data_in_memory) input_datasheet();
00418    for (unsigned i=0; i<numcol; i++) datasheet[i].value_for_missing(vm);
00419 }
00420 
00421 void Data::save_datasheet(const int relse)
00422 {
00423    if (!datasheet) {
00424       warning("Data::save_datasheet(): no data to save");
00425       return;
00426    }
00427    std::ofstream df(tdfname.c_str(),std::ios::out);
00428    if (!df) throw exception("Data::save_datasheet(): cannot open file");
00429    for (unsigned i=1; i<numcol; i++) {   // first column is an intercept
00430       df.write((char *)datasheet[i].dat_vec,numrec*sizeof(DataNode));
00431    }
00432    df.close();
00433    data_on_disk = 1;
00434    if (relse) release_datasheet();
00435 }
00436 
00437 void  Data::input_datasheet(void)
00438 {
00439    if (data_in_memory) return;
00440    if (data_on_disk) {
00441       std::ifstream df(tdfname.c_str());
00442       if (!df) throw exception("Data::input_datasheet(): cannot open file");
00443       for (unsigned i=1; i<numcol; i++) {  // first column is an intercept
00444          datasheet[i].resize(numrec);
00445          df.read((char *)datasheet[i].dat_vec,numrec*sizeof(DataNode));
00446       }
00447       df.close();
00448       data_in_memory = 1;                // data now is in memorry
00449    }
00450    else {
00451       warning("Data::input_datasheet(): data is not on disk");
00452    }
00453 }
00454 
00455 void Data::release_datasheet(void)
00456 {
00457    if (datasheet) {
00458       // any data must have a hard copy in disk
00459       if (!data_on_disk) save_datasheet();
00460       for (unsigned i=1; i<numcol; i++) datasheet[i].resize(0);
00461       data_in_memory = 0;     // data is not in memory, but should  on disk;
00462    }
00463 }
00464 
00465 void Data::row(const unsigned i,DataNode* recd)
00466 {
00467    if (!data_in_memory) input_datasheet();
00468    if (!recd) {
00469      if(numcol>0) {
00470        recd = new DataNode [numcol];
00471      }
00472      else {
00473        recd = 0;
00474      }
00475    }
00476    for (unsigned j=1; j<numcol; j++) recd[j] = datasheet[j][i];
00477 }
00478 
00479 Field Data::col(const std::string &cname)
00480 {
00481    int k =  field_index(cname);
00482    if (k<=0) {        // first column intercept is not accessible
00483       warning("Data::col(%s): no such column",cname.c_str());
00484       return Field();
00485    }
00486    if (!data_in_memory) input_datasheet();
00487 
00488    HashTable *tmp_hashtable = 0;
00489    DataNode *retval;
00490    if (numrec>0){
00491      retval = new DataNode [numrec];
00492    }
00493    else {
00494      retval = 0;
00495    }
00496    unsigned i;
00497    DataNode *colk = datasheet[k].dat_vec;
00498    if (datasheet[k].type()=='S') {
00499       tmp_hashtable = new HashTable;
00500       *tmp_hashtable = *(hashtable[k]);
00501       for (i=0; i<numrec; i++) {
00502          if (colk[i].missing) {retval[i].missing = 1;}
00503          else { retval[i].unsigned_val(colk[i].unsigned_val()); }
00504       }
00505    }
00506    else {
00507       for (i=0; i<numrec; i++) {
00508          if (colk[i].missing) { retval[i].missing = 1; }
00509          else { retval[i].double_val(colk[i].double_val()); }
00510       }
00511    }
00512    return Field(numrec,retval,datasheet[k].col_struct,tmp_hashtable);
00513 }
00514 
00515 DataNode* Data::rawcol(const std::string &cname)
00516 {
00517    int k =  field_index(cname);
00518    if (k > 0) {              // first column intercept is not accessible
00519       if (!data_in_memory) input_datasheet();
00520       return datasheet[k].dat_vec;
00521    }
00522    else {
00523       warning("Data::rawcol(%s): no such column",cname.c_str());
00524       return 0;
00525    }
00526 }
00527 
00528 DataNode* Data::rawcol(unsigned c)
00529 {
00530    if (c <= 0 || c >= numcol) throw exception("Data::rawcol(): out of range");
00531    if (!data_in_memory) input_datasheet();
00532    return datasheet[c].dat_vec;
00533 }
00534 
00535 Data& Data::newcol(const std::string &cname)
00536 {
00537    if (cname == "") {
00538       warning("Data::newcol(cname), cname is empty");
00539       return *this;
00540    }
00541    unsigned i;
00542    int k =  field_index(cname);
00543    if (k > 0) {         // first column intercept cannot be overwritten
00544       if (datasheet[k].type() == 'S') {
00545          warning("Data::newcol(): %s exits, can't overwrite string column",cname.c_str());
00546          return *this;
00547       }
00548       warning("Data.newcol(): %s exits, it's been overwritten",cname.c_str());
00549       new_col = k;
00550    }
00551    else {
00552       if (!data_in_memory) input_datasheet();   // data must be in memory
00553       if (numcol == maxnumcol) {               // data sheet is full
00554          Field *tmp_datasheet = new Field [maxnumcol];
00555          check_ptr(tmp_datasheet);
00556          HashTable **tmp_hashtable = new HashTable *[maxnumcol];
00557          check_ptr(tmp_hashtable);
00558          for (i=0; i<maxnumcol; i++) {
00559             tmp_datasheet[i] = datasheet[i];
00560             tmp_hashtable[i] = hashtable[i];
00561          }
00562          if(datasheet){
00563            delete [] datasheet;            // note I do not delete datasheet[i]
00564          datasheet=0;
00565          }
00566          if(hashtable){
00567            delete [] hashtable;            // note I do not delete hashtable[i]
00568            hashtable=0;
00569          }
00570          maxnumcol += 10;
00571          datasheet = new Field [maxnumcol];
00572          check_ptr(datasheet);
00573          hashtable  = new HashTable *[maxnumcol];
00574          check_ptr(hashtable);
00575          for (i=0; i<numcol; i++) {
00576             datasheet[i] = tmp_datasheet[i];
00577             hashtable[i] = tmp_hashtable[i];
00578          }
00579          for (i=numcol; i<maxnumcol; i++)  {
00580             hashtable[i] = new HashTable;
00581             check_ptr(hashtable[i]);
00582             datasheet[i] = 0;
00583          }
00584          if(tmp_datasheet){
00585            delete [] tmp_datasheet;
00586            tmp_datasheet=0;
00587          }
00588          if(tmp_hashtable){
00589            delete [] tmp_hashtable;
00590            tmp_hashtable=0;
00591          }
00592       }
00593       new_col = numcol++;
00594       datasheet[new_col].name(cname);
00595       datasheet[new_col].type('F');   // floating point number for the colum
00596       datasheet[new_col].index(new_col);
00597       datasheet[new_col].resize(numrec);
00598    }
00599    return  *this;
00600 }
00601 
00602 
00603 void Data::newcol(const std::string &cname,const Field& col)
00604 {
00605    unsigned i,n = col.size();
00606    if (n != numrec)
00607       warning("Data::newcol():%d,%d: size not conformable",numrec,n);
00608    this->newcol(cname);
00609    datasheet[new_col].col_struct = col.col_struct;
00610    datasheet[new_col].name(cname);   // cname override Field.name()
00611    datasheet[new_col].index(new_col);
00612    if (numrec < n) n = numrec;
00613    DataNode *tc = datasheet[new_col].dat_vec;
00614    for (i=0; i<n; i++) tc[i] = col.elem(i);
00615    for (i=n; i<numrec; i++) tc[i].missing = 1;
00616    datasheet[new_col].count_miss(numrec-n);
00617 }
00618 
00619 Data& Data::adjoin(Data& b)
00620 {
00621    unsigned n = b.num_rows();
00622    if (n != numrec) {
00623       warning("Data::adjoin(b):%d,%d: size unconformable: truncated",numrec,n);
00624    }
00625    if (!b.in_memory()) b.input_datasheet();
00626    unsigned i,nc=b.num_cols();
00627    for (i=0; i<nc; i++) {
00628       this->newcol("junk");
00629       datasheet[new_col] = b.datasheet[i];
00630       datasheet[new_col].index(new_col);
00631    }
00632    save_datasheet(0);
00633    return *this;
00634 }
00635 
00636 Data& Data::stack(Data& b)
00637 {
00638    warning("Data::stack(b): not yet available");
00639    return *this;
00640 }
00641 
00642 DataNode* Data::cell(const unsigned r,const unsigned c)
00643 {
00644    if (!data_in_memory) input_datasheet();   // data must be in memory
00645    if (r>=numrec || (c>=numcol&& c==0)) {
00646       warning("Data::cell(%d,%d): out of range",c,r);
00647       return 0;
00648    }
00649    else {
00650       return &datasheet[c][r];
00651    }
00652 }
00653 
00654 void Data::release(void)
00655 {
00656    if (datasheet) {
00657      delete [] datasheet;
00658      datasheet = 0;
00659    }
00660    if (hashtable) {
00661      for (int i=maxnumcol-1; i>=0; i--){
00662        if(hashtable[i]){
00663          delete hashtable[i];
00664          hashtable[i]=0;
00665        }
00666      }
00667      if(hashtable){
00668        delete [] hashtable; 
00669        hashtable=0;
00670      }
00671    }
00672 }
00673 
00674 
00675 Field Data::max(const std::string &cname)
00676 {
00677    if (!data_in_memory) input_datasheet();
00678    int i,nc=0;
00679    Vector<int> ivec;
00680    field_index_vec(ivec,cname);
00681    nc = ivec.size();
00682    Field xcol(nc);
00683    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].max();
00684    return xcol;
00685 }
00686 
00687 Field Data::min(const std::string &cname)
00688 {
00689    if (!data_in_memory) input_datasheet();
00690    int i,nc=0;
00691    Vector<int> ivec;
00692    field_index_vec(ivec,cname);
00693    nc = ivec.size();
00694    Field xcol(nc);
00695    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].min();
00696    return xcol;
00697 }
00698 
00699 Field Data::mean(const std::string &cname)
00700 {
00701    if (!data_in_memory) input_datasheet();
00702    int i,nc=0;
00703    Vector<int> ivec;
00704    field_index_vec(ivec,cname);
00705    nc = ivec.size();
00706    Field xcol(nc);
00707    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].mean();
00708    return xcol;
00709 }
00710 
00711 Field Data::variance(const std::string &cname)
00712 {
00713    if (!data_in_memory) input_datasheet();
00714    int i,nc=0;
00715    Vector<int> ivec;
00716    field_index_vec(ivec,cname);
00717    nc = ivec.size();
00718    Field xcol(nc);
00719    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].covariance();
00720    return xcol;
00721 }
00722 
00723 Field Data::sum(const std::string &cname)
00724 {
00725    if (!data_in_memory) input_datasheet();
00726    int i,nc=0;
00727    Vector<int> ivec;
00728    field_index_vec(ivec,cname);
00729    nc = ivec.size();
00730    Field xcol(nc);
00731    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].sum();
00732    return xcol;
00733 }
00734 
00735 Field Data::sumsq(const std::string &cname)
00736 {
00737    if (!data_in_memory) input_datasheet();
00738    int i,nc=0;
00739    Vector<int> ivec;
00740    field_index_vec(ivec,cname);
00741    nc = ivec.size();
00742    Field xcol(nc);
00743    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].sumsq();
00744    return xcol;
00745 }
00746 
00747 Field Data::product(const std::string &cname)
00748 {
00749    if (!data_in_memory) input_datasheet();
00750    int i,nc=0;
00751    Vector<int> ivec;
00752    field_index_vec(ivec,cname);
00753    nc = ivec.size();
00754    Field xcol(nc);
00755    for (i=0; i<nc; i++) xcol[i] = datasheet[ivec[i]].product();
00756    return xcol;
00757 }
00758 
00759 void Data::stat(void)
00760 {
00761    unsigned W = SESSION.output_precision+6;
00762    if (!data_in_memory) input_datasheet();
00763    unsigned i;
00764 
00765    std::cout << "\n  Name";
00766    for (i=1; i<numcol; i++) {   // first column intercept should be ignored
00767       if (datasheet[i].type()=='S') continue;
00768       std::cout << " " << std::setw(W) << datasheet[i].name();
00769    }
00770    std::cout << "\n";
00771 
00772    std::cout << "  Nobs";
00773    for (i=1; i<numcol; i++) {
00774       if (datasheet[i].type()=='S') continue;
00775       std::cout << " ";
00776       if (datasheet[i].type() == 'F') {
00777          std::cout << std::setw(W) <<  numrec-datasheet[i].nmiss();
00778       }
00779       else {
00780          std::cout << std::setw(W) << ".";
00781       }
00782    }
00783    std::cout << "\n";
00784 
00785    std::cout << "  Min ";
00786    for (i=1; i<numcol; i++) {
00787       if (datasheet[i].type()=='S') continue;
00788       std::cout << datasheet[i].min();
00789    }
00790    std::cout << "\n";
00791 
00792    std::cout << "  Max ";
00793    for (i=1; i<numcol; i++) {
00794       if (datasheet[i].type()=='S') continue;
00795       std::cout << datasheet[i].max();
00796    }
00797    std::cout << "\n";
00798 
00799    std::cout << "  Mean";
00800    for (i=1; i<numcol; i++) {
00801       if (datasheet[i].type()=='S') continue;
00802       std::cout << datasheet[i].mean();
00803    }
00804    std::cout << "\n";
00805 
00806    std::cout << "  S.D.";
00807    DataNode var;
00808    for (i=1; i<numcol; i++) {
00809       if (datasheet[i].type()=='S') continue;
00810          var = datasheet[i].covariance();
00811       if (!var.missing) var.double_val(std::sqrt(var.double_val()));
00812       std::cout << var;
00813    }
00814    std::cout << "\n\n";
00815 
00816    return;
00817 }
00818 
00819 doubleMatrix Data::mat(void)
00820 {
00821    doubleMatrix retval(numrec,numcol);
00822    if (numrec==0) throw exception("Data::mat(): empty data object");
00823    if (!data_in_memory) input_datasheet();
00824    DataNode *dat = 0;
00825    unsigned i,j;
00826    double *dpt;
00827    for (i=0; i<numrec; i++) {
00828       dpt = retval[i];
00829       for (j=1; j<numcol; j++) {   // first column intercept should be ignored
00830          if (datasheet[j].type() != 'S') {
00831             dat = &(datasheet[j].dat_vec[i]);
00832             if (dat->missing == 0) {
00833                dpt[j] = dat->double_val();
00834             }
00835             else {
00836                dpt[j] = 0.0;
00837             }
00838          }
00839       }
00840    }
00841    release_datasheet();
00842    return retval;
00843 }
00844 
00845 void Data::print(std::ostream& stream,const Vector<int> intvec,const int ic)
00846 {
00847    if (numrec==0) {
00848       std::cout << "\t empty data object\n" << std::flush;
00849       return;
00850    }
00851    if (!data_in_memory) input_datasheet();
00852    int nc = intvec.size();
00853    int kk;
00854    unsigned i,j,k,id;
00855    unsigned W = SESSION.output_precision+6;
00856    const char *str;
00857    char ch;
00858    stream.precision(SESSION.output_precision);
00859    DataNode *dat = 0;
00860    for (k=23,i=0; i<numrec; i++) {
00861       if (ic && i>=k) {
00862          k += 23;
00863          stream << "  more ... [q for quit] ";
00864          std::cin.get(ch);
00865          std::cin.seekg(0L,std::ios::beg);
00866          if (ch == 'q') break;
00867       }
00868       for (j=0; j<nc; j++) {
00869          kk = intvec[j];
00870          if (kk < 1) continue;   // first coloumn is reserved for intercept
00871          dat = &(datasheet[kk].dat_vec[i]);
00872          if (dat->missing) {
00873             stream << " " << std::setw(W) << ".";
00874          }
00875          else {
00876             if (datasheet[kk].type()=='S') {
00877                id = dat->unsigned_val();
00878                str = (const char*)(hashtable[kk]->find(id));
00879                stream << " " << std::setw(W) << str;
00880             }
00881             else {
00882                stream << " " << std::setw(W) << dat->double_val();
00883             }
00884          }
00885       }
00886       stream << "\n";
00887    }
00888    stream << std::flush;
00889    release_datasheet();
00890 }
00891 
00892 void Data::display(const std::string &fdname,const int ic)
00893 {
00894    Vector<int> intvec;
00895    field_index_vec(intvec,fdname);
00896    print(std::cout,intvec,ic);
00897 }
00898 
00899 void Data::save(const std::string &fname,
00900                    const int io_mode )
00901 {
00902    std::ofstream ofs;
00903    ofs.open(fname.c_str(),(OpenModeType)io_mode);
00904    if (!ofs) throw exception("Data::save(): cannot open file");
00905     Vector<int> intvec;
00906     field_index_vec(intvec);
00907     print(ofs,intvec,0);
00908     ofs.close();
00909 }
00910 
00911 std::ostream& operator<<(std::ostream& stream, Data& A)
00912 {
00913    unsigned nc = A.num_cols() - 1;      // first column is reserved for intercept
00914    if (nc == 0) return stream;
00915    Vector<int> intvec(nc);
00916    for (int i=0; i<nc; i++) intvec[i] = i+1;
00917    A.print(stream,intvec,1);
00918    return stream;
00919 }
00920 
00921 } ////////// end of namespace matvec
00922 

Generated on Thu Jun 16 17:13:38 2005 for Matvec by doxygen1.2.16