bioseq.h

Go to the documentation of this file.
00001 #ifndef BIOSEQ_H
00002 #define BIOSEQ_H
00003 
00012 #include "codon.h"
00013 #include <string>
00014 #include <iostream>
00015 #include <map>
00016 #include <algorithm>
00017 
00018 using namespace std;
00019 
00023 class bioseqexception : public exception {
00024    public:
00025       bioseqexception() throw() : errstr("bioseq error") { }
00026       bioseqexception(const string &msg) throw() : errstr(msg) { }
00027       ~bioseqexception() throw() { }
00028       const char* what() const throw() { return errstr.c_str(); }
00029    private:
00030       string errstr;
00031 };
00032 
00033 // some helper functions
00039 void printFasta(ostream &ous, const string& seq, int width=70);
00041 string reverseComplement(const string& seq);
00043 void reverseComplementInPlace(string& seq);
00048 void loadFastaIntoMap(const string &file, map<string,string> &store);
00052 enum sequenceType {PROTEINSEQ=1, DNASEQ=2, RNASEQ=4, NUCLEIC_ACID=6, GENERIC=7, UNKNOWN=0};
00056 void translate(string &pep, const string &seq, int begin, int end=-1);
00060 int countInternalStops(const string &seq);
00061 
00065 class bioseq {
00066    public:
00067       bioseq() : seq(), name(), title(), code(0) {}
00068       bioseq(const bioseq &s) : seq(s.seq), name(s.name),
00069                                 title(s.title), code(0) { }
00070       bioseq(const string &s) : seq(s), name(),title(), code(0) {}
00074       bioseq(const string &n, const string &s) 
00075          : seq(s), name(n), title(), code(0) {}
00080       bioseq(const string &n, const string &s, const string &t) 
00081          : seq(s), name(n), title(t), code(0) {}
00082       //virtual ~bioseq() { 
00083       ~bioseq() { if (code != 0) delete[] code; }
00084 
00085       void clear() { if (code != 0) delete[] code; code=0; seq.clear(); name.clear(); title.clear(); }
00086 
00089       void assign(const string &s) { 
00090          seq=s; 
00091          if (code != 0) {
00092             delete[] code;
00093             code=0;
00094          } 
00095       }
00097       
00103       void read(const string &file);
00104 
00123       bool read(istream &ins, string &header) throw (bioseqexception);
00124 
00130       string& importFasta(const string &fas);
00131 
00133 
00139       const string& toString() const { return seq; }
00140 
00143       void printFastaWithHeader(ostream &ous, int width=70);
00145       void printFasta(ostream &ous, int width=70);
00150       friend ostream& operator<<(ostream &ous, const bioseq &s);
00151 
00156       string substr(int b, int e) const; 
00157       string substring(const int b, const int len) const throw (bioseqexception)
00158       { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range"); }
00159       string substring(const int b) const throw (bioseqexception)
00160       { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range");
00161          return seq.substr(b); }
00162 
00166       bioseq subseq(int b, int e) const throw (bioseqexception);
00167 
00172       bioseq subsequence(int b, int len=-1) const throw (bioseqexception);
00173       void setName(const string &n) { name=n; }
00174       void setTitle(const string &t) { title=t; }
00175       const string& getName() const { return name; }
00176       const string& getTitle() const { return title; }
00180       const string& getSequence() const { return seq; }
00181 
00185       void rvc();
00189       map<char, double> getFrequency() const;
00190 
00194       pair<double,double> computeEntropy() const {
00195          return entropy(seq); }
00199       pair<double,double> computeEntropy(int b, int e) const throw(bioseqexception) {
00200          if (b<0 || e>seq.length()-1) { 
00201             cerr << "index error in computeEntropy()"; 
00202             throw bioseqexception("index out of range inside bioseq::computeEntropy");
00203             //exit(1);
00204          }
00205          return entropy(seq.substr(b,e-b+1)); }
00206 
00211       static pair<double,double> entropy(const string &seq);
00212       int length() const { return seq.length(); }
00219       const int* getcode() const;
00220 
00224       char operator[](const int i) const { return seq[i]; }
00225       bioseq& operator=(const bioseq &s);
00226       bioseq& operator+=(const bioseq &s) { seq += s.seq; return *this; }
00227       bioseq& operator=(const string &str);
00229       bool empty() { if (seq.empty()) return true; else return false; }
00230       void randomize() { 
00231          random_shuffle(seq.begin(), seq.end()); 
00232          delete[] code;
00233          code=0; 
00234       }
00236       void toLowerCase();
00237       void toUpperCase();
00238       virtual sequenceType getSequenceType() const { return GENERIC; }
00239 
00240    protected:
00241       string seq;
00242       string name;
00243       string title;
00247       mutable int* code;
00248 };
00249 
00251 int aachar2num(char a);
00253 char aanum2char(int c);
00254 
00258 class Protein : public bioseq {
00259    public:
00260       Protein() : bioseq() { codefreq[0]=-1; }
00261       Protein(const Protein &s) : bioseq(s)  { 
00262          for (int i=0; i<numcodes; i++) codefreq[i]=s.codefreq[i]; 
00263       }
00264       Protein(const string &str) : bioseq(str) { codefreq[0]=-1; }
00265       Protein(const bioseq &s) : bioseq(s)  { codefreq[0]=-1; }
00266       Protein(const string &n, const string &s) 
00267          : bioseq (n,s)  { codefreq[0]=-1; }
00268       Protein subseq(int b, int e) const {
00269          return bioseq::subseq(b,e); }
00270       ~Protein() { /* parent destructor is enough */ }
00271       Protein& operator=(const Protein &p);
00272       Protein& operator=(const string &str);
00273 
00274       static const char* threeLetterCode(const char A) 
00275       { return symbols[2*(toupper(A)-'A')]; }
00276       static const char* aminoAcidFullName(const char A) 
00277       { return symbols[2*(toupper(A)-'A')+1]; }
00278 
00280       double relativeEntropy() const;
00281       double relativeEntropyUniform() const;
00287       //const int* encode() const;
00290       //const int* getcode() const { if (code == 0) encode(); return code; }
00294       const double* getCodeFrequence();
00295       static const int numcodes=27; // including *
00296       static const char* oneLetterSymbols;
00299       sequenceType getSequenceType() const { return PROTEINSEQ; }
00300       bool hasStart() const { return seq[0] == 'M'; }
00301       bool hasStop() const { return seq[seq.length()-1] == '*'; }
00302       bool hasInternalStop() const;
00303       int countInternalStops();
00304 
00305    private:
00314       mutable double codefreq[27];
00315       //mutable int* code;
00320       static const char* symbols[];
00322       static const double aafreq[];
00325       //static const double aafrequniform[];
00326 };
00327 
00332 class DNA : public bioseq {
00333    public:
00334       DNA() : bioseq() { }
00335       DNA(const DNA &s) : bioseq(s) { }
00336       DNA(const string &s) : bioseq(s)  { }
00337       DNA(const string &n, const string &s) 
00338          : bioseq(n,s)  {}
00339       DNA(const string &n, const string &s, const string &t) 
00340          : bioseq(n,s,t)  { }
00341 
00342       DNA(const bioseq &s) : bioseq(s)  { }
00343       //~DNA() { if (code != 0) delete[] code; }
00344       //~DNA() { bioseq::~bioseq(); }
00345       ~DNA() { /* parent destructor is enought */ }
00346       DNA& operator=(const DNA &s);
00347       //DNA& operator+=(const DNA &s) { return bioseq::operator+=(s); }
00350       DNA& operator=(const string &str);
00351 
00355       Protein translate() const;
00362       Protein translate(int begin, int end=-1) const;
00363       DNA subseq(int b, int e) const {
00364          return bioseq::subseq(b,e); }
00370       const int* getcodeNuc() const;
00371       sequenceType getSequenceType() const { return DNASEQ; }
00374       double GCContent() const;
00375       //void revcomp() { seq=reverseComplement(seq); }
00378       void revcomp() { reverseComplementInPlace(seq); }
00384       Protein longestORFForward(int &b, int &e) const;
00388       void longestORFForward(Protein &p, int &b, int &e) const;
00391       Protein longestORFReverse(int &b, int &e) const;
00392       void longestORFReverse(Protein &p, int &b, int &e) const;
00393       Protein longestORF(char strand, int &b, int &e) const;
00394       Protein longestORF(int &b, int &e) const;
00395       void longestORF(Protein &p, int &b, int &e) const;
00396 
00397       static const codon& getCodonTable() { return codontable; }
00398 
00399    protected:
00405       static codon codontable;
00406       //mutable int* code;
00407 };
00408 
00411 class mRNA : public DNA {
00412    public: 
00413       mRNA() : DNA() { }
00416       mRNA(const string &s);
00417 
00418    private:
00421       int cdsb;
00423       int cdse;
00424       Protein prt;
00425 };
00426 
00427 
00428 #endif

Generated on Wed Oct 14 21:49:10 2009 for Softwares from Orpara by  doxygen 1.5.6