Frequency of nucleotides and oligonucleotides in a sequence

20000000){die("Error: Maximum allowed size of file is 20 MB");} $file_name=strtolower($_FILES['userfile']['name']); //LEER CADENA $file=$_FILES['userfile']['tmp_name']; $zd = fopen($file, "r"); $file_content=fread($zd,20000000); fclose($zd); print "

Uploaded file: $file_name
\n"; } else { die("No sequences submitted. Please go back and try again"); } } // At this point $file_type contains whatever has been submitted // The first problem is to know whether an unique sequence or multiple sequences had been submitted $greater_than_signs=substr_count ($file_content,">"); // Three options are possible: // no ">" are contained within $sequence ($greater_than_signs=0), // only 1 ">" is contained $greater_than_signs(=1), // several ">" are contained (so that several sequences have been submitted and $greater_than_signs is >1) // We must also remove all useless information. // Method: Convert content at $sequence to a variable containing 1 unique sequence per line. // A line break (\n) will separate each sequence. if ($greater_than_signs==0){ // Only the sequence has been submitted, so useless characters must be removed $sequence=preg_replace("/\W|\d/","",$file_content); }elseif ($greater_than_signs>1){ // One or more sequences with their identification after ">" have been submitted, // As identification is not required, the lines containg ">" are removed $file_content=substr($file_content,strpos($file_content,">")+1); // remove first and whatever is before it $theseqs=preg_split("/>/",$file_content,-1,PREG_SPLIT_NO_EMPTY); // get all sequences to array $theseqs. The first line of each element in the array is the decription of the sequence without the ">" sign $sequence=""; // in this variable all sequences will be stored (one sequence per line) foreach($theseqs as $k => $v){ $v=preg_replace("/\r/","\n",$v); // replace returns by linebreaks (may be present or not) $v=preg_replace("/\n\n/","\n",$v); // remove subsequence linebreaks (may be present or not) $v=preg_replace("/\n\n/","\n",$v); // again (just in case) while (substr_count($v,"\n\n")>0){ $v=preg_replace("/\n\n/","\n",$v); // again (just in case) } if (substr_count($v,"\n")==0){ die("Error:
Input information is not a correct Fasta file."); // just a simple checking } $pos=strpos($v,"\n"); // find first position of linebreak. The text from position 0 to $post will be the descriptor of the sequence if($pos==1){die("Error:
Input information is not a correct Fasta file.");} // just a simple checking $v=substr($v,$pos+1); // The descriptor of the sequence is removed and at this point $v will only contain the sequence $v=preg_replace("/\W|\d/","",$v); // Non-word characters (\W) (as for example linebreaks) and digists (\d) are remove // Here, $v will contain only the ACGT code in one line $sequence.=$v."\n"; // The sequences are added to variable $sequence, so that in each line one sequence is contained } $theseqs=array(); // Free memory } // get additional data $oligo_len=$_POST["len"]; $strands=$_POST["strands"]; // when length of query sequence is 0 => error if (strlen($sequence)==0){die("Error: query sequence not provided. Plase go back and try again.");} // print the form with the sequence print_form ("",$oligo_len,$strands); // when frequencies at both strands are requested, place in one line the sequence and its reverse complement if ($strands==2){$sequence.=" ".RevComp($sequence); } // compute requested. Data is saved to array $results if ($oligo_len=="1" or $oligo_len=="2" or $oligo_len=="3" or $oligo_len=="4" or $oligo_len=="5" or $oligo_len=="6" or $oligo_len=="7" or $oligo_len=="8"){ $result=find_oligos($sequence,$oligo_len); }elseif($oligo_len=="1s" or $oligo_len=="2s" or $oligo_len=="3s" or $oligo_len=="4s" or $oligo_len=="5s" or $oligo_len=="6s" or $oligo_len=="7s" or $oligo_len=="8s"){ $result=find_oligos($sequence,$oligo_len); $result=standarize_frecuencies($result); }elseif ($oligo_len=="ZOM"){ $result=ZOM_oligonucleotide_frecuencies($sequence); }elseif($oligo_len=="FOM"){ $result=FOM_oligonucleotide_frecuencies($sequence); }elseif($oligo_len=="SOM"){ $result=SOM_oligonucleotide_frecuencies($sequence); }elseif($oligo_len=="zscore"){ $result=zscores_for_tetranucleotide ($sequence); } // Number of sequences $NoSequences=$greater_than_signs; // Number of nucleotides in all sequences ($greater_than_signs is equal to number of linebreaks, and one linebreak is at the end of each sequence) $sequencelen=strlen($sequence)-$NoSequences; //print out results print "

Number of sequences: $NoSequences"; print "
Total length: $sequencelen bp"; if ($oligo_len=="1" or $oligo_len=="2" or $oligo_len=="3" or $oligo_len=="4" or $oligo_len=="5" or $oligo_len=="6" or $oligo_len=="7" or $oligo_len=="8"){ print "
Frequency of oligos with length $oligo_len
"; }elseif($oligo_len=="1s" or $oligo_len=="2s" or $oligo_len=="3s" or $oligo_len=="4s" or $oligo_len=="5s" or $oligo_len=="6s" or $oligo_len=="7s" or $oligo_len=="8s"){ print "
Standarized frequency of oligos with length $oligo_len
"; }elseif ($oligo_len=="ZOM"){ print "
Zero'th Order Markov chain for tetranucleotides (ZOM)
"; }elseif($oligo_len=="FOM"){ print "
First Order Markov chain for tetranucleotides (FOM)
"; }elseif($oligo_len=="SOM"){ print "
Second Order Markov chain for tetranucleotides (SOM)
"; }elseif($oligo_len=="zscore"){ print "
Z-scores for tetranucleotides
"; } print "\n\n"; } // ###################################################################################################### // ##################################### FUNCTIONS ################################### // ###################################################################################################### // NOTE: The functions bellow to search for frecuencies will work independently. // It is possible to generate functions which are nested to compute oligonucleotide frecuencies // (one function requiring another ones to work) and probably it will be more elegant, // but we wanted to generate copy and paste functions that may be easily used in other scripts. function print_form($sequence,$method,$strand){ if ($method==""){$method="2";} print "
\n"; print "
\n"; print "Copy your sequence in the textarea below\n"; print "
(non coding characters, as number, returns or spaces will be removed)\n"; print "\n"; print "\n
\n"; print "Or upload file (maximum file size: 100 KB txt/plain file): \n"; print "
Ocurrences for oligonucleotides of length:
  "; print "1 "; print "2 "; print "3 "; print "4 "; print "5 "; print "6 "; print "7 "; print "8
"; print "Standarized frequencies for oligonucleotides of length:
  "; print "1 "; print "2 "; print "3 "; print "4 "; print "5 "; print "6 "; print "7 "; print "8
"; print "Other frequencies:
"; print "   Zero'th Order Markov chain for tetranucleotides (ZOM)
"; print "   Fist Order Markov chain for tetranucleotides (FOM)
"; print "   Second Order Markov chain for tetranucleotides (SOM)
"; print "   Z-scores for tetranucleotides

"; print "Compute frequencies in \n"; print "\n"; print "


\n"; print "
This version does not work with degenerated nucleotides\n"; print "
\n"; print "
\n"; } function find_oligos($sequence,$oligo_len){ //for oligos 1 bases long if ($oligo_len==1){ $oligos["A"] = substr_count($sequence,"A"); $oligos["C"] = substr_count($sequence,"C"); $oligos["G"] = substr_count($sequence,"G"); $oligos["T"] = substr_count($sequence,"T"); return $oligos; } //for oligos with at least two bases 2 bases $i=0; $len=strlen($sequence)-$oligo_len+1; while ($i<$len){ $seq=substr($sequence,$i,$oligo_len); $oligos_1step[$seq]++; $i++; } $base=array("A","C","G","T"); //for oligos 2 bases long if ($oligo_len==2){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ if ($oligos_1step[$val_a.$val_b]){ $oligos[$val_a.$val_b] = $oligos_1step[$val_a.$val_b]; }else{ $oligos[$val_a.$val_b] = 0; } }} } //for oligos 3 bases long if ($oligo_len==3){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ if ($oligos_1step[$val_a.$val_b.$val_c]){ $oligos[$val_a.$val_b.$val_c] = $oligos_1step[$val_a.$val_b.$val_c]; }else{ $oligos[$val_a.$val_b.$val_c] = 0; } }}} } //for oligos 4 bases long if ($oligo_len==4){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ if ($oligos_1step[$val_a.$val_b.$val_c.$val_d]){ $oligos[$val_a.$val_b.$val_c.$val_d] = $oligos_1step[$val_a.$val_b.$val_c.$val_d]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d] = 0; } }}}} } //for oligos 5 bases long if ($oligo_len==5){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ foreach($base as $key_e => $val_e){ if ($oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e]){ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e] = $oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e] = 0; } }}}}} } //for oligos 6 bases long if ($oligo_len==6){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ foreach($base as $key_e => $val_e){ foreach($base as $key_f => $val_f){ if ($oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f]){ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f] = $oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f] = 0; } }}}}}} } //for oligos 7 bases long if ($oligo_len==7){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ foreach($base as $key_e => $val_e){ foreach($base as $key_f => $val_f){ foreach($base as $key_g => $val_g){ if ($oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g]){ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g] = $oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g] = 0; } }}}}}}} } //for oligos 8 bases long if ($oligo_len==8){ foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ foreach($base as $key_e => $val_e){ foreach($base as $key_f => $val_f){ foreach($base as $key_g => $val_g){ foreach($base as $key_h => $val_h){ if ($oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g.$val_h]){ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g.$val_h] = $oligos_1step[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g.$val_h]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f.$val_g.$val_h] = 0; } }}}}}}}} } return $oligos; } function ZOM_oligonucleotide_frecuencies($sequence){ // Zero'th order Markov chain (ZOM) oligonucleotide frecuencies $i=0; $len=strlen($sequence)-4+1; while ($i<$len){ $seq=substr($sequence,$i,4); $oligos_internos4[$seq]++; $i++; } $base_a=array("A","C","G","T"); $base_b=array("A","C","G","T"); $base_c=array("A","C","G","T"); $base_d=array("A","C","G","T"); // mucleotidos $oligos1["A"]=substr_count($sequence,"A"); $oligos1["C"]=substr_count($sequence,"C"); $oligos1["G"]=substr_count($sequence,"G"); $oligos1["T"]=substr_count($sequence,"T"); //para oligos de 4 foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ foreach($base_c as $key_c => $val_c){ foreach($base_d as $key_d => $val_d){ if ($oligos_internos4[$val_a.$val_b.$val_c.$val_d]){ $oligos4[$val_a.$val_b.$val_c.$val_d] = $oligos_internos4[$val_a.$val_b.$val_c.$val_d]; }else{ $oligos4[$val_a.$val_b.$val_c.$val_d] = 0; } }}}} // Compute Frecuencies ZOM foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ foreach($base_c as $key_c => $val_c){ foreach($base_d as $key_d => $val_d){ $fx[$val_a.$val_b.$val_c.$val_d] = $oligos4[$val_a.$val_b.$val_c.$val_d]/($oligos1[$val_a]*$oligos1[$val_b]*$oligos1[$val_c]*$oligos1[$val_d]); }}}} return $fx; } function FOM_oligonucleotide_frecuencies($sequence){ // First order Markov chain (FOM) oligonucleotide frecuencies $i=0; $len=strlen($sequence); $oligos4=array(); while ($i<$len){ $seq4=substr($sequence,$i,4); $oligos4[$seq4]++; $i++; } $oligos1=array(); $oligos2=array(); foreach($oligos4 as $k => $v){ $seq1=substr($k,0,1); $seq2=substr($k,0,2); $oligos1[$seq1]+=$v; $oligos2[$seq2]+=$v; } $base=array("A","C","G","T"); // Compute Frecuencies FOM foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ $ab=$oligos2[$val_a.$val_b]; foreach($base as $key_c => $val_c){ $bc=$oligos2[$val_b.$val_c]; foreach($base as $key_d => $val_d){ $fx[$val_a.$val_b.$val_c.$val_d] = ($oligos1[$val_b]*$oligos1[$val_c]*$oligos4[$val_a.$val_b.$val_c.$val_d])/($ab*$bc*$oligos2[$val_c.$val_d]); }}}} return $fx; } function SOM_oligonucleotide_frecuencies($sequence){ // Second order Markov chain (SOM) oligonucleotide frecuencies $i=0; $len=strlen($sequence); $oligos4=array(); while ($i<$len){ $s4=substr($sequence,$i,4); $oligos4[$s4]++; $i++; } $oligos3=array(); $oligos2=array(); foreach($oligos4 as $k => $v){ //if (strpos($v," ")>0){continue;} $s3=substr($k,0,3); $s2=substr($k,0,2); $oligos3[$s3]+=$v; $oligos2[$s2]+=$v; } $base=array("A","C","G","T"); // COMPUTE_frecuencies_SOM foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ $fx[$val_a.$val_b.$val_c.$val_d] = ($oligos4[$val_a.$val_b.$val_c.$val_d]*$oligos2[$val_b.$val_c])/($oligos3[$val_a.$val_b.$val_c]*$oligos3[$val_b.$val_c.$val_d]); }}}} return $fx; } function zscores_for_tetranucleotide($sequence){ // Teeling et al. BMC Bioinformatics 2004; 5:163. $i=0; $len=strlen($sequence); $oligos4=array(); while ($i<$len){ $seq4=substr($sequence,$i,4); $oligos4[$seq4]++; $i++; } $oligos3=array(); $oligos2=array(); foreach($oligos4 as $k => $v){ $seq3=substr($k,0,3); $seq2=substr($k,0,2); $oligos3[$seq3]+=$v; $oligos2[$seq2]+=$v; } $base=$base_b=$base_c=$base_d=array("A","C","G","T"); // SACAR Z-SCORES foreach($base as $key_a => $val_a){ foreach($base as $key_b => $val_b){ foreach($base as $key_c => $val_c){ foreach($base as $key_d => $val_d){ $exp = ($oligos3[$val_a.$val_b.$val_c]*$oligos3[$val_b.$val_c.$val_d])/$oligos2[$val_b.$val_c]; $var = $exp *((($oligos2[$val_b.$val_c]-$oligos3[$val_a.$val_b.$val_c])*($oligos2[$val_b.$val_c]-$oligos3[$val_b.$val_c.$val_d]))/($oligos2[$val_b.$val_c]*$oligos2[$val_b.$val_c])); $zscore[$val_a.$val_b.$val_c.$val_d] = ($oligos4[$val_a.$val_b.$val_c.$val_d]-$exp)/sqrt($var); }}}} return $zscore; } function standarize_frecuencies($array){ $sum=0; foreach($array as $k => $v){ $sum+=$v; } $c=sizeof($array)/$sum; foreach($array as $k => $v){ $array[$k]= $c*$v; } return $array; } // computes the reverse complement of a sequence (only ACGT nucleotides are used) // the method used may be considered very simple, but more elegant method are less efficient function RevComp($seq){ $seq=strrev($seq); $seq=str_replace("A", "t", $seq); $seq=str_replace("T", "a", $seq); $seq=str_replace("G", "c", $seq); $seq=str_replace("C", "g", $seq); $seq = strtoupper ($seq); return $seq; } ?>