DNA Sequence Alignment

From Biolecture.org

% <<<MATLAB code I created for DNA Sequence Alignment.>>>

clear; clc;
% Gene Sequence Alignment 
% 20141581_Kyunghyun Cho

fprintf('Type the sequqence of reference gene in '); cprintf('_red', 'line 10\n\n');
fprintf('Type the sequqence of your sample gene in '); cprintf('_red', 'line 13\n\n');
fprintf('Set the range of quality you are focusing in '); cprintf('_red', 'BOTH line 69, 185\n\n\n\n\n');

% <<<Type the sequence of "reference gene" here!>>>
ref = nt2int('ATGTTTGGCA'); 

% <<<Type the sequence of your "sample gene" here!>>>
sam = nt2int('ATGTC'); 


% Alignment Code - Do not edit!
[a,ref_length] = size(ref); % Basically, a=1, and the length of reference gene sequence is "ref_length"
[b,sam_length] = size(sam); % Basically, b=1, and the length of sample gene sequence is "sam_length"

% I considered the cases which the lengths of them are not identical.
% I needed some bases that mean nothing(i,e. just gap) in order to make sequences to have same length.
% Then, it is able to align two sequences that have different length.

if ref_length<sam_length
    
    length_difference = sam_length - ref_length;
    nobase = zeros(1,length_difference); % Basically, 0 means gap in sequence, so I created (1Xlength_differenece) matrix with zero.
    
    mod_ref = horzcat(ref,nobase); % Using function "horzcat" to concatenate ref and nobase marix.
    % For instance, if ref = [2,3,4] and sam = [1,2,2,3,4], then the length_difference is 2. Thus nobase = [0,0] and mod_ref = [2,3,4,0,0]. Now, size is same, then it is able to align!
    mod_sam = sam;

    % Now, then the question is how to align the sequences.
    % I used a trick using "for" function.
    
    % Using the first "for" to move the frame of "mod_sam "sequences to consider all possibility of alignment.
    % Simply, for example, suppose that our sequences are mod_ref = [2,3,4,0,0] and mod_sam = [1,2,2,3,4].
    % If just align them without consideration of possibility of frame moving, they are not matching at all. Because mod_ref(1) ~= mod_sam(1), mod_ref(2) ~= mod_sam(2) and so on.
    % However, what if the frame of mod_sam moved to like [2,3,4,1,2]. Then, three components are matching! Therefore, I considered all possible frame movement of "mod_sam".
    
    % Using the second "for" to do alignment and to calculate the quality of the alignment.
    % Simply, I created another function "aligned" which composed of all 1 or 0 values.
    % I compared two sequences whether they are matcing at certain point from begining(5') to end(3'). 1 means two sequences are matching at that point. 0 means not matcing. 
    % For example, if mod_ref = [1,2,1,4,0] and move_sam = [1,1,3,4,2], then aligned = [1,0,0,1,0].
    
    % The alignment quality I defined would be summation of all components of matrix "aligned" divide by "sam_length" multiply by 100(%).
    % I created "result" matrix that collects all possible alignment quaility during frame moving.
    % In "result", the highest value is "highest_q" which means the highest quality. And using "find" function to find where the highest_q is located in "result".

    % Using the third "for" to get most possible alignment result which has the highest quality.
    for i = 1:sam_length
        move_sam = circshift(mod_sam, i-1); 
        
        for j = 1:sam_length
            if mod_ref(j) == move_sam(j)
                aligned(j) = 1;
            else
                aligned(j) = 0;
            end
            quality = sum(aligned)/sam_length;
        end
        quality_family(i) = quality;
        
        
        
        % <<<Set the range of quality you are intersted in!>>> 
        X = find(quality_family >= 0.2);
        
        
        
        
        % Code for displaying alignment result (Aligned sequences in the range) - Do not edit!
        Number_quality_in_range = numel(X);
        
        for x = 1:Number_quality_in_range
            pos_sam_in_range(Number_quality_in_range,:) = circshift(sam, X(1,Number_quality_in_range)-1);
            
            for y = 1:sam_length
                if mod_ref(y) == pos_sam_in_range(x,y)
                pos_aligned(x,y) = 1;
                else
                pos_aligned(x,y) = 0;
                end  
            end
            
            for z = 1:sam_length
                    if pos_aligned(x,z) == 0 
                        aligned_ref.name = int2nt(mod_ref(z));
                        aligned_sam.name = int2nt(pos_sam_in_range(x,z));
                        aligned_ref.Color = [255 0 0]; 
                        aligned_sam.Color = [255 0 0];
                        pre1 = '<HTML><FONT color="';
                        post1 = '</FONT></HTML>';
                        listboxStr_ref1 = cell(numel( aligned_ref ),1);
                        listboxStr_sam1 = cell(numel( aligned_sam ),1);
                        str_ref1 = [pre1 reshape( dec2hex(aligned_ref.Color, 2)',1, 6) '">' aligned_ref.name post1];
                        listboxStr_ref1 = str_ref1;
                        Show_aligned_ref = {listboxStr_ref1};
                        str_sam1 = [pre1 reshape( dec2hex(aligned_sam.Color, 2)',1, 6) '">' aligned_sam.name post1];
                        listboxStr_sam1 = str_sam1;
                        Show_aligned_sam = {listboxStr_sam1};
                        
                    else
                        Show_aligned_ref = int2nt(mod_ref(z));
                        Show_aligned_ref = cellstr(Show_aligned_ref); 
                        Show_aligned_sam = int2nt(pos_sam_in_range(x,z));
                        Show_aligned_sam = cellstr(Show_aligned_sam);
                    end
                    Result_in_range(2*x-1,z) = Show_aligned_ref;
                    Result_in_range(2*x,z) = Show_aligned_sam; 
                end 
        end
            
        % Find highest quality - Do not edit!
        highest_q = max(quality_family); 
        [l, location_highest_q] = find(quality_family==highest_q); % Basically, l=1, so focusing on location_highest_q only.

        most_pos_sam = circshift(sam, location_highest_q-1); % Find most possible sequence of sample.
        
        for k = 1:sam_length
            if mod_ref(k) == most_pos_sam(k)
                most_aligned(k) = 1;
            else
                most_aligned(k) = 0;
            end
        end
    end
    
    % Code for displaying alignment result (Aligned sequences that have highest quality) - Do not edit!
    for m =1:sam_length
        
        if most_aligned(m) == 0 % If two sequences are different at same position, display sequences as red colour using "rgb2Hex" function.
            most_aligned_ref.name = int2nt(mod_ref(m));
            most_aligned_sam.name = int2nt(most_pos_sam(m));
            most_aligned_ref.Color = [255 0 0]; 
            most_aligned_sam.Color = [255 0 0];
            pre = '<HTML><FONT color="';
            post = '</FONT></HTML>';
            listboxStr_ref = cell(numel( most_aligned_ref ),1);
            listboxStr_sam = cell(numel( most_aligned_sam ),1);
            
            str_ref = [pre reshape( dec2hex(most_aligned_ref.Color, 2)',1, 6) '">' most_aligned_ref.name post];
            listboxStr_ref = str_ref;
            Show_most_aligned_ref = {listboxStr_ref};
            str_sam = [pre reshape( dec2hex(most_aligned_sam.Color, 2)',1, 6) '">' most_aligned_sam.name post];
            listboxStr_sam = str_sam;
            Show_most_aligned_sam = {listboxStr_sam};
            
        else
            Show_most_aligned_ref = int2nt(mod_ref(m));
            Show_most_aligned_ref = cellstr(Show_most_aligned_ref);
            Show_most_aligned_sam = int2nt(most_pos_sam(m));
            Show_most_aligned_sam = cellstr(Show_most_aligned_sam);
        end
        
    Result_highest_quality(1,m) = Show_most_aligned_ref;
    Result_highest_quality(2,m) = Show_most_aligned_sam;
    
    end    
    
    
% In the same way, but consider the case of same length or larger reference length.
else
    
    length_difference = ref_length - sam_length;
    nobase = zeros(1,length_difference);
    mod_ref = ref;
    mod_sam = horzcat(sam,nobase);
    
    for i = 1:ref_length
    move_sam = circshift(mod_sam, i-1); 
        for j = 1:ref_length
            if mod_ref(j) == move_sam(j)
                aligned(j) = 1; 
            else
                aligned(j) = 0; 
            end
        end
        quality = sum(aligned)/ref_length;
        quality_family(i) = quality; 
        
        % <<<Set the range of quality you are intersted in!>>> 
        X = find(quality_family >= 0.2);
        
        
        
        % Code for displaying alignment result (Aligned sequences in the range) - Do not edit!
        Number_quality_in_range = numel(X);

        
        for x = 1:Number_quality_in_range
            pos_sam_in_range(Number_quality_in_range,:) = circshift(mod_sam, X(1,Number_quality_in_range)-1);
            
            for y = 1:ref_length
                if mod_ref(y) == pos_sam_in_range(x,y)
                pos_aligned(x,y) = 1;
                else
                pos_aligned(x,y) = 0;
                end  
            end
            
            for z = 1:ref_length
                    if pos_aligned(x,z) == 0 
                        aligned_ref.name = int2nt(mod_ref(z));
                        aligned_sam.name = int2nt(pos_sam_in_range(x,z));
                        aligned_ref.Color = [255 0 0]; 
                        aligned_sam.Color = [255 0 0];
                        pre1 = '<HTML><FONT color="';
                        post1 = '</FONT></HTML>';
                        listboxStr_ref1 = cell(numel( aligned_ref ),1);
                        listboxStr_sam1 = cell(numel( aligned_sam ),1);
                        str_ref1 = [pre1 reshape( dec2hex(aligned_ref.Color, 2)',1, 6) '">' aligned_ref.name post1];
                        listboxStr_ref1 = str_ref1;
                        Show_aligned_ref = {listboxStr_ref1};
                        str_sam1 = [pre1 reshape( dec2hex(aligned_sam.Color, 2)',1, 6) '">' aligned_sam.name post1];
                        listboxStr_sam1 = str_sam1;
                        Show_aligned_sam = {listboxStr_sam1};
                        
                    else
                        Show_aligned_ref = int2nt(mod_ref(z));
                        Show_aligned_ref = cellstr(Show_aligned_ref); 
                        Show_aligned_sam = int2nt(pos_sam_in_range(x,z));
                        Show_aligned_sam = cellstr(Show_aligned_sam);
                    end
                    Result_in_range(2*x-1,z) = Show_aligned_ref;
                    Result_in_range(2*x,z) = Show_aligned_sam;
                end 
        end
            

        highest_q = max(quality_family);
        [l, location_highest_q] = find(quality_family==highest_q); 

        most_pos_sam = circshift(mod_sam, location_highest_q-1);
        
        for k = 1:ref_length
            if mod_ref(k) == most_pos_sam(k)
                most_aligned(k) = 1; 
            else
                most_aligned(k) = 0; 
            end
        end
    end
% Code for displaying alignment result (Aligned sequences that have highest quality) - Do not edit!
    for n =1:ref_length
        
        if most_aligned(n) == 0
            most_aligned_ref.name = int2nt(mod_ref(n));
            most_aligned_sam.name = int2nt(most_pos_sam(n));
            most_aligned_ref.Color = [255 0 0];
            most_aligned_sam.Color = [255 0 0];
            pre = '<HTML><FONT color="';
            post = '</FONT></HTML>';
            listboxStr_ref = cell(numel( most_aligned_ref ),1);
            listboxStr_sam = cell(numel( most_aligned_sam ),1);
            
            str_ref = [pre reshape( dec2hex(most_aligned_ref.Color, 2)',1, 6) '">' most_aligned_ref.name post];
            listboxStr_ref = str_ref;
            Show_most_aligned_ref = {listboxStr_ref};
            str_sam = [pre reshape( dec2hex(most_aligned_sam.Color, 2)',1, 6) '">' most_aligned_sam.name post];
            listboxStr_sam = str_sam;
            Show_most_aligned_sam = {listboxStr_sam};
            
        else
            Show_most_aligned_ref = int2nt(mod_ref(n));
            Show_most_aligned_ref = cellstr(Show_most_aligned_ref);
            Show_most_aligned_sam = int2nt(most_pos_sam(n));
            Show_most_aligned_sam = cellstr(Show_most_aligned_sam);
        end
        
    Result_highest_quality(1,n) = Show_most_aligned_ref;
    Result_highest_quality(2,n) = Show_most_aligned_sam;
    
    end    
end

% Displaying qualities - Do not Edit!
fprintf('The qualities '); cprintf('_red', 'in the range'); fprintf(' are'); disp(quality_family(X));

fprintf('The cells '); cprintf('_red', '"Result_in_range"'); fprintf(' showing the possible alignment matching in the range of quality you had set. \n\n\n\n');

fprintf('The '); cprintf('_red', 'highest'); fprintf(' quality is'); disp(highest_q);

fprintf('The cells '); cprintf('_red', '"Result_highest_quality"'); fprintf(' showing the most possible alignment matching between refenrence gene and your sample gene. \n\n');