hatrees.h

Go to the documentation of this file.
00001 // file hatrees.h  replaces forest.h
00002 // template class;  no *.cpp file associated with it
00003 #ifndef DISET_H
00004 #define DISET_H
00005 //#define HAVE_NAMESPACE_STD
00006 
00007 #include <string>
00008 #include <map>
00009 //#include <hash_map.h>
00010 //#include <multimap.h> included by <map>
00011 #include <utility>   // contains pair.h
00012 #include <fstream>
00013 #include <sstream>
00014 //#include "libpq++.h"
00015 //#include <pqxx>
00016 #include <set>
00017 #include <vector>
00018 #include <iostream>
00019 #include <cstdlib>
00020 
00021 //#include <istream.h>
00022 //
00031 //#define USE_HASH_MAP  // will insert duplicate key
00032 
00033 // there are a lot std utilites used in this 
00034 // header, the following statement polute the namespace
00035 using namespace std;
00036 //using namespace pqxx;
00037 
00038 template<class T> struct node
00039 {
00040         node() :  rank(0), key() { parent=this; }  // point to self
00041         node(const T &k) : rank(0), key(k) { parent=this; }
00042         node(const T &k, node* p) : parent(p), rank(0), key(k) { }
00043         /* set parent = this is essential */
00044         node(const node &n) { key = n.key; parent = this;  rank = n.rank; }
00045         // if use parent = n.parent then segmentation fault
00046         ~node() { /*nothing needs to be done*/ }
00047         bool operator==(const node& n) const { return key==n.key; }
00048         bool operator<(const node &n) const { return key < n.key; }
00049         //node& operator=(const node &n) { if (&n != this) { key = n.key; parent = n.parent; rank = n.rank; } return *this; }
00050         node& operator=(const node &n) { if (&n != this) { key = n.key; parent = this; rank = n.rank; } return *this; }
00051 
00052         T key;   // stored in the map<key, value>
00053         node<T>* parent;
00054         int rank;
00055 };
00056 
00057 /* tree operations */
00059 /* x is not altered */
00060 template<class T> node<T>* getRoot(node<T> *x) {
00061         if (x != x->parent)  x->parent = getRoot(x->parent); 
00062         return x->parent;
00063 }
00064 template<class T> void mergeRoot(node<T> *&r1, node<T> *&r2) {
00065         if(r1 == r2) return;  // if both are the same set
00066         if (r1->rank > r2->rank) r2->parent = r1;
00067         else {
00068                 r1->parent = r2;
00069                 if (r1->rank == r2->rank) r2->rank++;
00070         }
00071 }
00072 template<class T> void join(node<T> &x, node<T> &y) {
00073         node<T> *r1 = getRoot(&x);  // this is necessary for the compiler
00074         node<T> *r2 = getRoot(&y);
00075         mergeRoot(r1, r2);
00076         //mergeRoot(getRoot(&x), getRoot(&y));  // compiler got confused 
00077 }
00078 template<class T> void join(node<T>* x, node<T>* y) {
00079         node<T> *r1 = getRoot(x);  // this is necessary for the compiler
00080         node<T> *r2 = getRoot(y);
00081         mergeRoot(r1, r2);
00082         //mergeRoot(getRoot(x), getRoot(y));  // compiler got confused 
00083 }
00084 
00086 /* for hash_map of string as key, hash is not working
00087  * should be implemented later */
00088 /*
00089 template<> struct hash<string> {
00090         size_t operator()(const string &key) const {
00091                 size_t res=0;
00092                 typedef string::const_iterator CI;
00093                 CI  p = key.begin();
00094                 CI end= key.end();
00095 
00096                 while (*p) {
00097                         res = (res<<4) + *p++;
00098                         size_t g = res & 0xF0000000L;
00099                         if (g) res ^= g >> 24;
00100                         res &= ~g;
00101                 }
00102                 return res;
00103         }
00104 };
00105 */
00107 //   hatrees class definition /////////////////////////
00115 template<class T> class hatrees
00116 {
00117         public:
00118                 hatrees() {};
00119                 ~hatrees();
00120                 void clear() { nodes.clear(); result.clear(); }
00122                 void readFromMap(const std::multimap<T, T> &m);
00123                 void readFromFile(const string &file);
00124       /* only gecods.cpp needs this function
00125        * I am not compiling gecods.cpp anymore
00126        */
00127                 //void readFromDB(PgDatabase &db);
00128                 //void readFromDB(pqxx::result &qres);
00129 
00130                 /********** output ********************/
00131                 /* using representative as cluster id, results inserted into tab */
00132                 //void loadTable(PgDatabase &db, string tab);
00133                 //void loadTable(pqxx::connection &db, string tab);
00134 
00135                 /* changes the parent pointer of the nodes      */
00136                 void showStore(ostream &os) const;  
00137 
00138                 /* number of unique members */
00139                 int getNodeCount() const { return nodes.size(); }
00142                 void showCluster(ostream &os, bool reverse= true);
00147                 void showClusterIntId(ostream &ous, int &id);
00148 
00150                 int showClusterByLine(ostream &os);
00151                 set<T> keyset() const;
00152                 vector<T> keyarray() const;
00153 
00156                 void clusterArray(vector<set<T> > &vecset);
00157 
00163                 std::multimap<T,T>& getCluster() { if (result.empty()) transform(); return result; }
00164 
00165 #ifdef USE_HASH_MAP
00166         //typename hash_map<T, node<T>* >::iterator niterator;
00167         //typename hash_map<T, node<T>* >::const_iterator const_niterator;
00168         typedef typename hash_map<T, node<T>* >::iterator niterator;
00169         typedef typename hash_map<T, node<T>* >::const_iterator const_niterator;
00170 #else
00171         typedef typename map<T, node<T>* >::iterator niterator;
00172         typedef typename map<T, node<T>* >::const_iterator const_niterator;
00173         //typename map<T, node<T>* >::iterator niterator;
00174         //typename map<T, node<T>* >::const_iterator const_niterator;
00175 #endif
00176 
00177         private:
00178 #ifdef USE_HASH_MAP 
00179                 hash_map<T, node<T>* > nodes;
00180 #else
00181                 map<T, node<T>* > nodes;
00182 #endif
00183 
00186                 //multimap<T, T> cluster;  // sorted result, loaded after
00187                 std::multimap<T, T> result;  // sorted result in map format
00188 
00189                 /* transform the nodStore cluster into mutimaped cluster
00190                  * in a table format cluster_id -> members */
00191                 void transform();
00192 };
00193 
00195 template<class T> hatrees<T>::~hatrees() { 
00196         niterator MI = nodes.begin();
00197         while (MI != nodes.end()) {
00198                 delete MI->second;
00199                 MI++;
00200         }
00201 }
00202 
00203 template<class T> void hatrees<T>::readFromMap(const std::multimap<T, T> &m) {
00204         pair<niterator, bool> p1, p2;
00205         T f1, f2; 
00206         typename std::multimap<T, T>::const_iterator imi = m.begin();
00207 
00208         while (imi != m.end()) {
00209                 f1 = imi->first;
00210                 f2 = imi->second;
00211                 p1 = nodes.insert(pair<T, node<T>* >(f1, new node<T>(f1)));
00212                 p2 = nodes.insert(pair<T, node<T>* >(f2, new node<T>(f2)));
00213                 join(p1.first->second, p2.first->second);
00214                 imi++;
00215         }
00216 }
00217 
00218 template<class T> void hatrees<T>::readFromFile(const string &file) {
00219         ifstream IN(file.c_str());
00220         if (IN.fail()) {
00221                 cerr << "reading from " << file << "failed\n";
00222                 exit(1);
00223         }
00224         string ln;
00225         pair<niterator, bool> p1, p2;
00226         T f1, f2; 
00227         //hash_map<T, T> xx;   // hash_map inserts duplicate key under template
00228         //hash_map<T, int> hm;
00229         //map<T, T> yy;
00230         int id=0;
00231 
00232         getline(IN, ln);
00233         while (!IN.eof()) {
00234                 istringstream ist(ln);
00235                 ist >> f1 >> f2;
00236                 /*
00237                 xx.insert(pair<T, T>(f1, f2));
00238                 yy.insert(pair<T, T>(f1, f2));
00239                 if (hm.find(f1) == hm.end()) {
00240                         hm.insert(pair<T, int>(f1, id++));
00241                 }
00242                 if (hm.find(f2) == hm.end()) {
00243                         hm.insert(pair<T, int>(f2, id++));
00244                 }
00245                 */
00246                 p1 = nodes.insert(pair<T, node<T>* >(f1, new node<T>(f1)));
00247                 p2 = nodes.insert(pair<T, node<T>* >(f2, new node<T>(f2)));
00248                 join(p1.first->second, p2.first->second);
00249                 getline(IN, ln);
00250         }
00251         /*
00252         cout << "Size of hash_map<string, string> is " << xx.size() << endl;
00253         cout << "Size of map<string, string> is " << yy.size() << endl;
00254         cout << "Size of hash_map<string, int> is " << hm.size() << endl;
00255         hash_map<T, T>::iterator xxi = xx.begin();
00256         map<T,T>::iterator yyi = yy.begin();
00257         while (xxi != xx.end()) {
00258                 cout << xxi->first << "\t" << xxi->second << endl;
00259                 xxi++;
00260         }
00261         cout << "================================\n";
00262         while (yyi != yy.end()) {
00263                 cout << yyi->first << "\t" << yyi->second << endl;
00264                 yyi++;
00265         }
00266         cout << "-----------------------------\n";
00267         hash_map<T, int>::iterator hmi = hm.begin();
00268         while (hmi != hm.end()) {
00269                 cout << hmi->first << "\t" << hmi->second << endl;
00270                 hmi++;
00271         }
00272         */
00273 }
00274 
00275 /* db has the result ready; will use field 1, and 2 as input */
00276 //template<class T> void hatrees<T>::readFromDB(PgDatabase &db) {
00277 /* removing this function so that this library will not
00278  * dependent on postgres
00279 template<class T> void hatrees<T>::readFromDB(pqxx::result &qres) {
00280         pair<niterator, bool> p1, p2;
00281         T f1, f2; 
00282         string ln, tmp;
00283 
00284         int i;
00285         for (i=0; i< qres.size(); i++) {
00286                 //ln = db.GetValue(i,0);
00287                 qres[i][0].to(ln);
00288       qres[i][1].to(tmp);
00289                 //ln.append("\t").append(db.GetValue(i,1));
00290                 ln.append("\t").append(tmp);
00291       
00292                 istringstream ist(ln);
00293                 ist >> f1 >> f2;
00294                 p1 = nodes.insert(pair<T, node<T>* >(f1, new node<T>(f1)));
00295                 p2 = nodes.insert(pair<T, node<T>* >(f2, new node<T>(f2)));
00296                 join(p1.first->second, p2.first->second);
00297         }
00298 }
00299 */
00300 /* load into relational table(member_key, member_representative) */
00301 //template<class T> void hatrees<T>::loadTable(PgDatabase &db,string tab) {
00302 /* disable to make this library less dependent on postgress 
00303  * installation
00304 template<class T> void hatrees<T>::loadTable(pqxx::connection &db,string tab) {
00305         string queryBase = "insert into " + tab + " values(";
00306         
00307         niterator it = nodes.begin();
00308         node<T>* root;
00309         cerr << "Loading into table member_key\tgroup_representative\n";
00310    work Xaction(db, "forinsertion");
00311 
00312         while (it != nodes.end()) {
00313                 root = getRoot(it->second->parent);
00314                 string query = queryBase;
00315                 ostringstream ous;
00316                 ous << '\'' << it->first << "','" << root->key << "')";
00317                 query += ous.str();
00318       try {
00319          Xaction.exec(query);
00320       }
00321       catch (exception &err) {
00322          cerr << err.what() << endl;
00323          exit(1);
00324       }
00325       Xaction.commit();
00326       */
00327       /*
00328                 if (!db.ExecCommandOk(query.c_str())) {
00329                         cerr << query << " failed\n";
00330                         exit(1);
00331                 }
00332       */
00333 /*
00334                 it++;
00335         }
00336         cerr << "Total number of unique keys is " << nodes.size() << endl;
00337 }
00338 */
00339 
00340 /* members->cluster_id sorted according to members if using <map>
00341  * if using hash_map then not in any order */
00342 template<class T> void hatrees<T>::showStore(ostream &os) const {
00343         const_niterator it = nodes.begin();
00344         node<T>* root;
00345         os << "key\tcluster_representative\n";
00346         while (it != nodes.end()) {
00347                 root = getRoot(it->second->parent);
00348                 os << it->first << "\t" << root->key << endl;
00349                 it++;
00350         }
00351         os << "Total number of unique keys is " << nodes.size() << endl;
00352         cerr << "Total number of unique keys is " << nodes.size() << endl;
00353 }
00354 
00355 /* return the key as a set<T> */
00356 template<class T> set<T> hatrees<T>::keyset() const {
00357         set<T> tmpset;
00358         const_niterator it = nodes.begin();
00359         while (it != nodes.end()) {
00360                 tmpset.insert(it->first);
00361                 it++;
00362         }
00363         return tmpset;
00364 }
00365 
00366 template<class T> vector<T> hatrees<T>::keyarray()  const {
00367         vector<T> tmp;
00368         const_niterator it = nodes.begin();
00369         while (it != nodes.end()) {
00370                 //cout << it->first << "   ";  //debug
00371                 tmp.push_back(it->first);
00372                 it++;
00373         }
00374         return tmp;
00375 }
00376 
00377 /* return a vector of clusters as sets of members */
00378 template<class T> void hatrees<T>::clusterArray(vector< set<T> > &vecset) {
00379         if (result.empty()) transform();
00380         vecset.clear();  // just in case it has something already in it
00381 
00382         typename std::multimap<T, T>::const_iterator mi = result.begin();
00383         while (mi != result.end()) {
00384                 set<T> tmpset;
00385                 tmpset.insert(mi->second);
00386                 //os << mi->second;
00387                 T cluster_id = mi->first;
00388                 mi++;
00389                 while (mi != result.end() && mi->first == cluster_id) {
00390                         //os << '\t' << mi->second;
00391                         tmpset.insert(mi->second);
00392                         mi++;
00393                 }
00394                 vecset.push_back(tmpset);
00395                 //os << endl;
00396         }
00397 }
00398 
00399 // you can provide a inital id to name the clusters
00400 // This function will increment this id for each cluster
00401 template<class T> void hatrees<T>::showClusterIntId(ostream &ous, int &id) {
00402         if (result.empty()) transform();
00403         typename std::multimap<T, T>::const_iterator mi = result.begin();
00404         while (mi != result.end()) {
00405                 set<T> tmpset;
00406                 ous << mi->second << '\t' << id << endl;
00407                 //tmpset.insert(mi->second);
00408                 T cluster_id = mi->first;
00409                 mi++;
00410                 while (mi != result.end() && mi->first == cluster_id) {
00411                         //tmpset.insert(mi->second);
00412                         ous << mi->second << '\t' << id << endl;
00413                         mi++;
00414                 }
00415                 //vecset.push_back(tmpset);
00416                 //os << endl;
00417                 ++id;
00418         }
00419 }
00420 
00421 /* output in cluster_id -->members 
00422  * Table format, good for relational database
00423  * */
00424 template<class T> void hatrees<T>::showCluster(ostream &os, bool reverse) {
00425         if (result.empty()) transform();
00426 
00427         typename std::multimap<T, T>::const_iterator mi = result.begin();
00428         if (reverse) {
00429                 while (mi != result.end()) {
00430                         os << mi->second << '\t' << mi->first << endl;
00431                         mi++;
00432                 }
00433         }
00434         else {
00435                 while (mi != result.end()) {
00436                         os << mi->first << '\t' << mi->second << endl;
00437                         mi++;
00438                 }
00439         }
00440         //os << "Total members: " << result.size() << endl;
00441         //for db loading, this line causes trouble
00442         cerr << "Total members: " << result.size() << endl;
00443 }
00444 
00445 template<class T> void hatrees<T>::transform() {
00446         niterator it = nodes.begin();
00447         node<T>* root;
00448         while (it != nodes.end()) {
00449                 root = getRoot(it->second->parent);
00450                 result.insert(pair<T, T>(root->key, it->first));
00451                 it++;
00452         }
00453 }
00454 
00457 template<class T> int hatrees<T>::showClusterByLine(ostream &os) {
00458         if (result.empty()) transform();
00459         os << "Output format: one cluster per line\n\n";
00460         int clusterCnt=0;
00461         typename std::multimap<T, T>::const_iterator mi = result.begin();
00462         while (mi != result.end()) {
00463                 clusterCnt++;
00464                 os << mi->second;
00465                 T cluster_id = mi->first;
00466                 mi++;
00467                 while (mi != result.end() && mi->first == cluster_id) {
00468                         os << '\t' << mi->second;
00469                         mi++;
00470                 }
00471                 os << "\n\n";
00472         }
00473         os << "Total " << clusterCnt << " clusters\n";
00474         return clusterCnt;
00475 }
00476 
00477 #endif

Generated on Wed Oct 14 21:49:11 2009 for Softwares from Orpara by  doxygen 1.5.6