#!/usr/bin/perl
################################################
# count_nucleotides.pl
# Version 1
# Robert D. Cormia
# UCSC Programming for Bioinformatics II
# October 10, 2003
################################################
# 2. In this exercise we will work with the concept of counting nucleotides.
#    Write a Perl program called 'count_nucleotides' to count the nucleotides
#    in a data file which looks like this (found in sample.nucleotides):
#
#    ACTGCCACCAGGTCCTCGCGGTAGTCA
#    GACGGTCAGCCGCGTTTATCGTCCTGA
#    CCTCCGCGTAGGACGTCGTTGTTCTAG
#
#    Print the sequence in each line followed by the counts.
#    Print the total number of nucleotides, in this fashion:
#
#    Seq is: ACTGCCACCAGGTCCTCGCGGTAGTCA
#    A:5 C:10 G:7 T:5
#    ...
#    ...
#    Total Counts => A:12 C:26 G:23 T:20
# ----------------------------------------------------------------
#
# Pseudocode
#
#   Open file
#   Read file (line by line)
#   For each line, read characters into an array
#   Run 'count_nucleotides()' subroutine against data
#   Store the results of ATCG count for each string
#   Print the results of ATCG count for each string
#
# Usage
#
#   count_nucleotides.pl [FILE]
#
#   FILE defaults to 'sample_nucleotides.txt' in the current directory.
#   Each count is printed on its own row together with its percentage of
#   the bases on that line (or of all bases, for the totals).
#
# -----------------------------------------------------------------

use strict;
use warnings;

# Input file used when no file name is given on the command line.
my $DEFAULT_INPUT = 'sample_nucleotides.txt';

my %tcount;             # running per-nucleotide totals across all lines
my $total_bases = 0;    # number of valid bases seen across all lines

print "This program will count nucleotides\n";

# Read every line of the input file into memory.
my $file = @ARGV ? $ARGV[0] : $DEFAULT_INPUT;
open(my $in, '<', $file) or die "Cannot open file '$file': $!";
my @lines = <$in>;
close $in;
print "Got all of the sequence\n";

foreach my $raw (@lines) {

    # Work on a copy and delete every character that is not one of
    # A, T, C, G, N or X.  This also removes the line terminator, so no
    # separate chomp is required.
    (my $seq = $raw) =~ tr/ATCGNX//cd;

    # Skip lines that contain no valid characters (e.g. blank lines).
    next if $seq eq '';

    # A fresh tally per line, so the rows printed for this line never
    # depend on which symbols happened to appear on earlier lines.
    my $count = count_nucleotides($seq);
    my $bases = length $seq;

    # Print the results for this line.
    print "\n";
    print "Seq is: $seq\n";
    print "Nucleotide count: ", $bases, " bases\n";
    print '-' x 60, "\n";
    print_counts( $count, $bases );

    # Fold this line's tally into the running totals.
    $tcount{$_} += $count->{$_} for keys %$count;
    $total_bases += $bases;
}

# Print the totals.  Percentages are relative to the number of bases in
# the whole file, not to the length of the last line read.  Nothing is
# printed when the file held no valid bases, which also avoids dividing
# by zero.
if ( $total_bases > 0 ) {
    print "\n\nTotal counts: $total_bases bases\n";
    print '-' x 60, "\n";
    print_counts( \%tcount, $total_bases );
}

# count_nucleotides($seq)
#   Tally how often each character occurs in $seq.
#   Returns a reference to a hash mapping character => occurrence count.
#   An empty string yields an empty hash.
sub count_nucleotides {
    my ($seq) = @_;

    my %count;
    $count{$_}++ for split //, $seq;

    return \%count;
}

# print_counts($count, $bases)
#   Print one row per key of the hash referenced by $count, in sorted key
#   order, showing the count and its percentage of $bases.
#   $bases must be non-zero; both call sites guarantee this.
sub print_counts {
    my ( $count, $bases ) = @_;

    foreach my $nuc ( sort keys %$count ) {
        # The symbol is passed as an argument rather than interpolated
        # into the format string.
        printf "\t%s:\t%4d\t(%.1f%%)\n",
            $nuc, $count->{$nuc}, 100 * $count->{$nuc} / $bases;
    }

    return;
}

__END__