Difference between revisions of "Homework 6. In a multi-sequence FASTA file, produce statistics such as sequence number, average seq length, GC content, AT content, etc"
imported>YoungKwang Jung (Created page with "<p> </p> <p> </p> <p> </p> <p> </p> <p> </p> <p> </p> <p> </p> <hr /> <p>Homework_in_the_bioinformatics_class</p>") |
imported>YoungKwang Jung |
||
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | <p>It is Myeongji's idea</p> | ||
+ | |||
<p> </p> | <p> </p> | ||
+ | |||
+ | <p>#!/usr/bin/perl<br /> | ||
+ | use strict;<br /> | ||
+ | use warnings;</p> | ||
+ | |||
+ | <p>my$DNA1="TRET.txt";<br /> | ||
+ | open(fileHandle, $DNA1);<br /> | ||
+ | my@allLines1=<fileHandle>;<br /> | ||
+ | chomp(@allLines1);<br /> | ||
+ | my@protein1 = splice @allLines1, 1;<br /> | ||
+ | close(fileHandle);</p> | ||
+ | |||
+ | <p>my$DNA2="VEGFA.txt";<br /> | ||
+ | open(fileHandle, $DNA2);<br /> | ||
+ | my@allLines2=<fileHandle>;<br /> | ||
+ | chomp(@allLines2);<br /> | ||
+ | my@protein2 = splice @allLines2, 1;<br /> | ||
+ | close(fileHandle);</p> | ||
+ | |||
+ | <p>my$DNA3="Propreinsulin.txt";<br /> | ||
+ | open(fileHandle, $DNA3);<br /> | ||
+ | my@allLines3=<fileHandle>;<br /> | ||
+ | chomp(@allLines3);<br /> | ||
+ | my@protein3 = splice @allLines3, 1;<br /> | ||
+ | close(fileHandle);</p> | ||
+ | |||
+ | <p>my$pro1 = join "",@protein1;<br /> | ||
+ | print "TRET : ", $pro1, "\n";<br /> | ||
+ | my$pro2 = join "",@protein2;<br /> | ||
+ | print "VEGFA : ", $pro2, "\n";<br /> | ||
+ | my$pro3 = join "",@protein3;<br /> | ||
+ | print "Propreinsulin : ", $pro3, "\n";</p> | ||
+ | |||
+ | <p>print "\n";</p> | ||
+ | |||
+ | <p>my$length1=length$pro1;<br /> | ||
+ | print "The length of TRET sequence : $length1.\n";<br /> | ||
+ | my$length2=length$pro2;<br /> | ||
+ | print "The length of VEGFA sequence : $length2.\n";<br /> | ||
+ | my$length3=length$pro3;<br /> | ||
+ | print "The length of Propreinsulin sequence : $length3.\n";</p> | ||
+ | |||
+ | <p>my$average=($length1+$length2+$length3)/3;<br /> | ||
+ | print "The average length of multi-sequences : $average.\n";</p> | ||
+ | |||
+ | <p>print "\n";</p> | ||
+ | |||
+ | <p># =~tr/AT/BC/ : A를 B로, T를 C로 교환 (1:1)<br /> | ||
+ | my $i;<br /> | ||
+ | for($i=1; $i<=3; $i++)<br /> | ||
+ | {<br /> | ||
+ | if($i == 1)<br /> | ||
+ | {<br /> | ||
+ | my$pro = $pro1;<br /> | ||
+ | my$a=($pro=~tr/A/A/);<br /> | ||
+ | my$b=($pro=~tr/C/C/);<br /> | ||
+ | my$c=($pro=~tr/G/G/);<br /> | ||
+ | my$d=($pro=~tr/T/T/);<br /> | ||
+ | my$Total=$a+$b+$c+$d;<br /> | ||
+ | print"<protein", $i, ">\n";<br /> | ||
+ | print"The number of A is $a.\n";<br /> | ||
+ | print"The number of C is $b.\n";<br /> | ||
+ | print"The number of G is $c.\n";<br /> | ||
+ | print"The number of T is $d.\n";<br /> | ||
+ | print"Total bases in DNA : $Total.\n";</p> | ||
+ | |||
+ | <p> #count of GC<br /> | ||
+ | #=~s:search, /이걸/이걸로/ 치환 (1:1아님), g:글로벌 매칭(스트링 전체를 훑고 매칭하는 것), 치환한 갯수를 결과로 나타냄<br /> | ||
+ | my$GC=($pro=~s/GC/GC/g); <br /> | ||
+ | print"The total number of dinucleotide GC in DNA : $GC.\n";<br /> | ||
+ | <br /> | ||
+ | #count of AT<br /> | ||
+ | my$AT=($pro=~s/AT/AT/g);<br /> | ||
+ | print"The total number of dinucleotide AT in DNA : $AT.\n";<br /> | ||
+ | <br /> | ||
+ | #percentage of GC & AT<br /> | ||
+ | my$GCper=($GC/($Total)*100);<br /> | ||
+ | print"The percentage of GC: $GCper%.\n";<br /> | ||
+ | my$ATper=($AT/($Total)*100);<br /> | ||
+ | print"The percentage of AT: $ATper%.\n";<br /> | ||
+ | print "\n";<br /> | ||
+ | };<br /> | ||
+ | if($i == 2)<br /> | ||
+ | {<br /> | ||
+ | my$pro = $pro2;<br /> | ||
+ | my$a=($pro=~tr/A/A/);<br /> | ||
+ | my$b=($pro=~tr/C/C/);<br /> | ||
+ | my$c=($pro=~tr/G/G/);<br /> | ||
+ | my$d=($pro=~tr/T/T/);<br /> | ||
+ | my$Total=$a+$b+$c+$d;<br /> | ||
+ | print"<protein", $i, ">\n";<br /> | ||
+ | print"The number of A is $a.\n";<br /> | ||
+ | print"The number of C is $b.\n";<br /> | ||
+ | print"The number of G is $c.\n";<br /> | ||
+ | print"The number of T is $d.\n";<br /> | ||
+ | print"Total bases in DNA : $Total.\n";</p> | ||
+ | |||
+ | <p> #count of GC<br /> | ||
+ | #=~s:search, /이걸/이걸로/ 치환 (1:1아님), g:글로벌 매칭(스트링 전체를 훑고 매칭하는 것), 치환한 갯수를 결과로 나타냄<br /> | ||
+ | my$GC=($pro=~s/GC/GC/g); <br /> | ||
+ | print"The total number of dinucleotide GC in DNA : $GC.\n";<br /> | ||
+ | <br /> | ||
+ | #count of AT<br /> | ||
+ | my$AT=($pro=~s/AT/AT/g);<br /> | ||
+ | print"The total number of dinucleotide AT in DNA : $AT.\n";<br /> | ||
+ | <br /> | ||
+ | #percentage of GC & AT<br /> | ||
+ | my$GCper=($GC/($Total)*100);<br /> | ||
+ | print"The percentage of GC: $GCper%.\n";<br /> | ||
+ | my$ATper=($AT/($Total)*100);<br /> | ||
+ | print"The percentage of AT: $ATper%.\n";<br /> | ||
+ | print "\n";<br /> | ||
+ | };<br /> | ||
+ | if($i == 3)<br /> | ||
+ | {<br /> | ||
+ | my$pro = $pro3;<br /> | ||
+ | my$a=($pro=~tr/A/A/);<br /> | ||
+ | my$b=($pro=~tr/C/C/);<br /> | ||
+ | my$c=($pro=~tr/G/G/);<br /> | ||
+ | my$d=($pro=~tr/T/T/);<br /> | ||
+ | my$Total=$a+$b+$c+$d;<br /> | ||
+ | print"<protein", $i, ">\n";<br /> | ||
+ | print"The number of A is $a.\n";<br /> | ||
+ | print"The number of C is $b.\n";<br /> | ||
+ | print"The number of G is $c.\n";<br /> | ||
+ | print"The number of T is $d.\n";<br /> | ||
+ | print"Total bases in DNA : $Total.\n";</p> | ||
+ | |||
+ | <p> #count of GC<br /> | ||
+ | #=~s:search, /이걸/이걸로/ 치환 (1:1아님), g:글로벌 매칭(스트링 전체를 훑고 매칭하는 것), 치환한 갯수를 결과로 나타냄<br /> | ||
+ | my$GC=($pro=~s/GC/GC/g); <br /> | ||
+ | print"The total number of dinucleotide GC in DNA : $GC.\n";<br /> | ||
+ | <br /> | ||
+ | #count of AT<br /> | ||
+ | my$AT=($pro=~s/AT/AT/g);<br /> | ||
+ | print"The total number of dinucleotide AT in DNA : $AT.\n";<br /> | ||
+ | <br /> | ||
+ | #percentage of GC & AT<br /> | ||
+ | my$GCper=($GC/($Total)*100);<br /> | ||
+ | print"The percentage of GC: $GCper%.\n";<br /> | ||
+ | my$ATper=($AT/($Total)*100);<br /> | ||
+ | print"The percentage of AT: $ATper%.\n";<br /> | ||
+ | print "\n";<br /> | ||
+ | };</p> | ||
+ | |||
+ | <p>};</p> | ||
+ | |||
+ | <p>exit;</p> | ||
+ | |||
+ | <p>my $file = <FASTA>;<br /> | ||
+ | my @R = split "",$file; <br /> | ||
+ | my $A =0;<br /> | ||
+ | foreach my $nuc (@R) {<br /> | ||
+ | if ($nuc eq 'A') {<br /> | ||
+ | $A = $A+1;<br /> | ||
+ | }<br /> | ||
+ | }</p> | ||
<p> </p> | <p> </p> |
Latest revision as of 19:29, 27 May 2017
It is Myeongji's idea
#!/usr/bin/perl
# Report basic statistics for a set of single-record FASTA files:
# the sequences themselves, their lengths, the average length, and for each
# sequence the A/C/G/T counts, the GC/AT dinucleotide counts and the GC/AT
# content in percent.
#
# Run as a script it processes TRET.txt, VEGFA.txt and Propreinsulin.txt from
# the current directory. The file can also be loaded with require/do, in which
# case nothing runs automatically and main() can be called with other inputs.
use strict;
use warnings;

# Returns the built-in inputs as a list of [label, path] pairs.
sub default_sources {
    return (
        [ 'TRET',          'TRET.txt' ],
        [ 'VEGFA',         'VEGFA.txt' ],
        [ 'Propreinsulin', 'Propreinsulin.txt' ],
    );
}

# Reads one FASTA file and returns its sequence as a single string.
#   $path - file to read; its first line is treated as the header and dropped.
# Dies with the OS error message if the file cannot be opened or closed.
sub read_sequence {
    my ($path) = @_;

    open my $in, '<', $path or die "Cannot open '$path': $!\n";
    my @lines = <$in>;
    close $in or die "Cannot close '$path': $!\n";

    shift @lines;    # discard the header line

    my $sequence = join '', @lines;

    # Removes line endings (including the "\r" of CRLF files) and stray
    # blanks so they are neither printed nor counted in the length.
    $sequence =~ s/\s+//g;

    return $sequence;
}

# Computes the base statistics of one sequence.
#   $sequence - nucleotide string; upper- and lowercase letters are counted.
# Returns a hash reference with the keys
#   A, C, G, T            - number of occurrences of each base
#   total                 - A + C + G + T
#   gc_pairs, at_pairs    - number of "GC" / "AT" dinucleotides
#   gc_percent, at_percent - (G+C)/total and (A+T)/total in percent,
#                            0 when the sequence holds no A/C/G/T at all
sub base_stats {
    my ($sequence) = @_;

    # tr/// with an empty replacement list only counts; nothing is modified.
    my %stats = (
        A => ( $sequence =~ tr/Aa// ),
        C => ( $sequence =~ tr/Cc// ),
        G => ( $sequence =~ tr/Gg// ),
        T => ( $sequence =~ tr/Tt// ),
    );
    $stats{total} = $stats{A} + $stats{C} + $stats{G} + $stats{T};

    # Counting through a list assignment yields a real 0 when nothing
    # matches; the count returned by s///g is an empty string in that case.
    $stats{gc_pairs} = () = $sequence =~ /GC/gi;
    $stats{at_pairs} = () = $sequence =~ /AT/gi;

    # GC/AT content is the share of G+C (A+T) bases, not of dinucleotides.
    # The guard avoids a division by zero for an empty sequence.
    $stats{gc_percent} =
        $stats{total} ? 100 * ( $stats{G} + $stats{C} ) / $stats{total} : 0;
    $stats{at_percent} =
        $stats{total} ? 100 * ( $stats{A} + $stats{T} ) / $stats{total} : 0;

    return \%stats;
}

# Prints the per-sequence section of the report.
#   $index    - 1-based position of the sequence, shown in the section title
#   $sequence - nucleotide string to analyse
sub print_base_report {
    my ( $index, $sequence ) = @_;

    my $stats = base_stats($sequence);

    print "<protein", $index, ">\n";
    for my $base (qw(A C G T)) {
        print "The number of $base is $stats->{$base}.\n";
    }
    print "Total bases in DNA : $stats->{total}.\n";
    print "The total number of dinucleotide GC in DNA : $stats->{gc_pairs}.\n";
    print "The total number of dinucleotide AT in DNA : $stats->{at_pairs}.\n";
    print "The percentage of GC: $stats->{gc_percent}%.\n";
    print "The percentage of AT: $stats->{at_percent}%.\n";
    print "\n";

    return;
}

# Entry point: prints the complete report to the currently selected handle.
#   @sources - optional list of [label, path] pairs; defaults to the three
#              built-in files when omitted.
# Dies if any of the files cannot be read.
sub main {
    my @sources = @_ ? @_ : default_sources();

    # Read everything first so a missing file aborts before any output.
    my @records;
    for my $source (@sources) {
        my ( $label, $path ) = @{$source};
        push @records, { label => $label, sequence => read_sequence($path) };
    }

    for my $record (@records) {
        print "$record->{label} : ", $record->{sequence}, "\n";
    }
    print "\n";

    my $length_sum = 0;
    for my $record (@records) {
        my $length = length $record->{sequence};
        print "The length of $record->{label} sequence : $length.\n";
        $length_sum += $length;
    }
    my $average = @records ? $length_sum / scalar(@records) : 0;
    print "The average length of multi-sequences : $average.\n";
    print "\n";

    my $index = 0;
    for my $record (@records) {
        $index++;
        print_base_report( $index, $record->{sequence} );
    }

    return;
}

# Run the report only when executed directly, not when loaded by other code.
main() unless caller;

1;