00001 #ifndef BIOSEQ_H
00002 #define BIOSEQ_H
00003
00012 #include "codon.h"
00013 #include <string>
00014 #include <iostream>
00015 #include <map>
00016 #include <algorithm>
00017
00018 using namespace std;
00019
00023 class bioseqexception : public exception {
00024 public:
00025 bioseqexception() throw() : errstr("bioseq error") { }
00026 bioseqexception(const string &msg) throw() : errstr(msg) { }
00027 ~bioseqexception() throw() { }
00028 const char* what() const throw() { return errstr.c_str(); }
00029 private:
00030 string errstr;
00031 };
00032
00033
00039 void printFasta(ostream &ous, const string& seq, int width=70);
00041 string reverseComplement(const string& seq);
00043 void reverseComplementInPlace(string& seq);
00048 void loadFastaIntoMap(const string &file, map<string,string> &store);
00052 enum sequenceType {PROTEINSEQ=1, DNASEQ=2, RNASEQ=4, NUCLEIC_ACID=6, GENERIC=7, UNKNOWN=0};
00056 void translate(string &pep, const string &seq, int begin, int end=-1);
00060 int countInternalStops(const string &seq);
00061
00065 class bioseq {
00066 public:
00067 bioseq() : seq(), name(), title(), code(0) {}
00068 bioseq(const bioseq &s) : seq(s.seq), name(s.name),
00069 title(s.title), code(0) { }
00070 bioseq(const string &s) : seq(s), name(),title(), code(0) {}
00074 bioseq(const string &n, const string &s)
00075 : seq(s), name(n), title(), code(0) {}
00080 bioseq(const string &n, const string &s, const string &t)
00081 : seq(s), name(n), title(t), code(0) {}
00082
00083 ~bioseq() { if (code != 0) delete[] code; }
00084
00085 void clear() { if (code != 0) delete[] code; code=0; seq.clear(); name.clear(); title.clear(); }
00086
00089 void assign(const string &s) {
00090 seq=s;
00091 if (code != 0) {
00092 delete[] code;
00093 code=0;
00094 }
00095 }
00097
00103 void read(const string &file);
00104
00123 bool read(istream &ins, string &header) throw (bioseqexception);
00124
00130 string& importFasta(const string &fas);
00131
00133
00139 const string& toString() const { return seq; }
00140
00143 void printFastaWithHeader(ostream &ous, int width=70);
00145 void printFasta(ostream &ous, int width=70);
00150 friend ostream& operator<<(ostream &ous, const bioseq &s);
00151
00156 string substr(int b, int e) const;
00157 string substring(const int b, const int len) const throw (bioseqexception)
00158 { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range"); }
00159 string substring(const int b) const throw (bioseqexception)
00160 { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range");
00161 return seq.substr(b); }
00162
00166 bioseq subseq(int b, int e) const throw (bioseqexception);
00167
00172 bioseq subsequence(int b, int len=-1) const throw (bioseqexception);
00173 void setName(const string &n) { name=n; }
00174 void setTitle(const string &t) { title=t; }
00175 const string& getName() const { return name; }
00176 const string& getTitle() const { return title; }
00180 const string& getSequence() const { return seq; }
00181
00185 void rvc();
00189 map<char, double> getFrequency() const;
00190
00194 pair<double,double> computeEntropy() const {
00195 return entropy(seq); }
00199 pair<double,double> computeEntropy(int b, int e) const throw(bioseqexception) {
00200 if (b<0 || e>seq.length()-1) {
00201 cerr << "index error in computeEntropy()";
00202 throw bioseqexception("index out of range inside bioseq::computeEntropy");
00203
00204 }
00205 return entropy(seq.substr(b,e-b+1)); }
00206
00211 static pair<double,double> entropy(const string &seq);
00212 int length() const { return seq.length(); }
00219 const int* getcode() const;
00220
00224 char operator[](const int i) const { return seq[i]; }
00225 bioseq& operator=(const bioseq &s);
00226 bioseq& operator+=(const bioseq &s) { seq += s.seq; return *this; }
00227 bioseq& operator=(const string &str);
00229 bool empty() { if (seq.empty()) return true; else return false; }
00230 void randomize() {
00231 random_shuffle(seq.begin(), seq.end());
00232 delete[] code;
00233 code=0;
00234 }
00236 void toLowerCase();
00237 void toUpperCase();
00238 virtual sequenceType getSequenceType() const { return GENERIC; }
00239
00240 protected:
00241 string seq;
00242 string name;
00243 string title;
00247 mutable int* code;
00248 };
00249
00251 int aachar2num(char a);
00253 char aanum2char(int c);
00254
00258 class Protein : public bioseq {
00259 public:
00260 Protein() : bioseq() { codefreq[0]=-1; }
00261 Protein(const Protein &s) : bioseq(s) {
00262 for (int i=0; i<numcodes; i++) codefreq[i]=s.codefreq[i];
00263 }
00264 Protein(const string &str) : bioseq(str) { codefreq[0]=-1; }
00265 Protein(const bioseq &s) : bioseq(s) { codefreq[0]=-1; }
00266 Protein(const string &n, const string &s)
00267 : bioseq (n,s) { codefreq[0]=-1; }
00268 Protein subseq(int b, int e) const {
00269 return bioseq::subseq(b,e); }
00270 ~Protein() { }
00271 Protein& operator=(const Protein &p);
00272 Protein& operator=(const string &str);
00273
00274 static const char* threeLetterCode(const char A)
00275 { return symbols[2*(toupper(A)-'A')]; }
00276 static const char* aminoAcidFullName(const char A)
00277 { return symbols[2*(toupper(A)-'A')+1]; }
00278
00280 double relativeEntropy() const;
00281 double relativeEntropyUniform() const;
00287
00290
00294 const double* getCodeFrequence();
00295 static const int numcodes=27;
00296 static const char* oneLetterSymbols;
00299 sequenceType getSequenceType() const { return PROTEINSEQ; }
00300 bool hasStart() const { return seq[0] == 'M'; }
00301 bool hasStop() const { return seq[seq.length()-1] == '*'; }
00302 bool hasInternalStop() const;
00303 int countInternalStops();
00304
00305 private:
00314 mutable double codefreq[27];
00315
00320 static const char* symbols[];
00322 static const double aafreq[];
00325
00326 };
00327
00332 class DNA : public bioseq {
00333 public:
00334 DNA() : bioseq() { }
00335 DNA(const DNA &s) : bioseq(s) { }
00336 DNA(const string &s) : bioseq(s) { }
00337 DNA(const string &n, const string &s)
00338 : bioseq(n,s) {}
00339 DNA(const string &n, const string &s, const string &t)
00340 : bioseq(n,s,t) { }
00341
00342 DNA(const bioseq &s) : bioseq(s) { }
00343
00344
00345 ~DNA() { }
00346 DNA& operator=(const DNA &s);
00347
00350 DNA& operator=(const string &str);
00351
00355 Protein translate() const;
00362 Protein translate(int begin, int end=-1) const;
00363 DNA subseq(int b, int e) const {
00364 return bioseq::subseq(b,e); }
00370 const int* getcodeNuc() const;
00371 sequenceType getSequenceType() const { return DNASEQ; }
00374 double GCContent() const;
00375
00378 void revcomp() { reverseComplementInPlace(seq); }
00384 Protein longestORFForward(int &b, int &e) const;
00388 void longestORFForward(Protein &p, int &b, int &e) const;
00391 Protein longestORFReverse(int &b, int &e) const;
00392 void longestORFReverse(Protein &p, int &b, int &e) const;
00393 Protein longestORF(char strand, int &b, int &e) const;
00394 Protein longestORF(int &b, int &e) const;
00395 void longestORF(Protein &p, int &b, int &e) const;
00396
00397 static const codon& getCodonTable() { return codontable; }
00398
00399 protected:
00405 static codon codontable;
00406
00407 };
00408
00411 class mRNA : public DNA {
00412 public:
00413 mRNA() : DNA() { }
00416 mRNA(const string &s);
00417
00418 private:
00421 int cdsb;
00423 int cdse;
00424 Protein prt;
00425 };
00426
00427
00428 #endif