DNA Sequence Alignment
% <<<MATLAB code I created for DNA Sequence Alignment.>>>
clear; clc;
% Gene Sequence Alignment
% 20141581_Kyunghyun Cho
fprintf('Type the sequqence of reference gene in '); cprintf('_red', 'line 10\n\n');
fprintf('Type the sequqence of your sample gene in '); cprintf('_red', 'line 13\n\n');
fprintf('Set the range of quality you are focusing in '); cprintf('_red', 'BOTH line 69, 185\n\n\n\n\n');
% <<<Type the sequence of "reference gene" here!>>>
ref = nt2int('ATGTTTGGCA');
% <<<Type the sequence of your "sample gene" here!>>>
sam = nt2int('ATGTC');
% Alignment Code - Do not edit!
[a,ref_length] = size(ref); % Basically, a=1, and the length of reference gene sequence is "ref_length"
[b,sam_length] = size(sam); % Basically, b=1, and the length of sample gene sequence is "sam_length"
% I considered the cases which the lengths of them are not identical.
% I needed some bases that mean nothing(i,e. just gap) in order to make sequences to have same length.
% Then, it is able to align two sequences that have different length.
if ref_length<sam_length
length_difference = sam_length - ref_length;
nobase = zeros(1,length_difference); % Basically, 0 means gap in sequence, so I created (1Xlength_differenece) matrix with zero.
mod_ref = horzcat(ref,nobase); % Using function "horzcat" to concatenate ref and nobase marix.
% For instance, if ref = [2,3,4] and sam = [1,2,2,3,4], then the length_difference is 2. Thus nobase = [0,0] and mod_ref = [2,3,4,0,0]. Now, size is same, then it is able to align!
mod_sam = sam;
% Now, then the question is how to align the sequences.
% I used a trick using "for" function.
% Using the first "for" to move the frame of "mod_sam "sequences to consider all possibility of alignment.
% Simply, for example, suppose that our sequences are mod_ref = [2,3,4,0,0] and mod_sam = [1,2,2,3,4].
% If just align them without consideration of possibility of frame moving, they are not matching at all. Because mod_ref(1) ~= mod_sam(1), mod_ref(2) ~= mod_sam(2) and so on.
% However, what if the frame of mod_sam moved to like [2,3,4,1,2]. Then, three components are matching! Therefore, I considered all possible frame movement of "mod_sam".
% Using the second "for" to do alignment and to calculate the quality of the alignment.
% Simply, I created another function "aligned" which composed of all 1 or 0 values.
% I compared two sequences whether they are matcing at certain point from begining(5') to end(3'). 1 means two sequences are matching at that point. 0 means not matcing.
% For example, if mod_ref = [1,2,1,4,0] and move_sam = [1,1,3,4,2], then aligned = [1,0,0,1,0].
% The alignment quality I defined would be summation of all components of matrix "aligned" divide by "sam_length" multiply by 100(%).
% I created "result" matrix that collects all possible alignment quaility during frame moving.
% In "result", the highest value is "highest_q" which means the highest quality. And using "find" function to find where the highest_q is located in "result".
% Using the third "for" to get most possible alignment result which has the highest quality.
for i = 1:sam_length
move_sam = circshift(mod_sam, i-1);
for j = 1:sam_length
if mod_ref(j) == move_sam(j)
aligned(j) = 1;
else
aligned(j) = 0;
end
quality = sum(aligned)/sam_length;
end
quality_family(i) = quality;
% <<<Set the range of quality you are intersted in!>>>
X = find(quality_family >= 0.2);
% Code for displaying alignment result (Aligned sequences in the range) - Do not edit!
Number_quality_in_range = numel(X);
for x = 1:Number_quality_in_range
pos_sam_in_range(Number_quality_in_range,:) = circshift(sam, X(1,Number_quality_in_range)-1);
for y = 1:sam_length
if mod_ref(y) == pos_sam_in_range(x,y)
pos_aligned(x,y) = 1;
else
pos_aligned(x,y) = 0;
end
end
for z = 1:sam_length
if pos_aligned(x,z) == 0
aligned_ref.name = int2nt(mod_ref(z));
aligned_sam.name = int2nt(pos_sam_in_range(x,z));
aligned_ref.Color = [255 0 0];
aligned_sam.Color = [255 0 0];
pre1 = '<HTML><FONT color="';
post1 = '</FONT></HTML>';
listboxStr_ref1 = cell(numel( aligned_ref ),1);
listboxStr_sam1 = cell(numel( aligned_sam ),1);
str_ref1 = [pre1 reshape( dec2hex(aligned_ref.Color, 2)',1, 6) '">' aligned_ref.name post1];
listboxStr_ref1 = str_ref1;
Show_aligned_ref = {listboxStr_ref1};
str_sam1 = [pre1 reshape( dec2hex(aligned_sam.Color, 2)',1, 6) '">' aligned_sam.name post1];
listboxStr_sam1 = str_sam1;
Show_aligned_sam = {listboxStr_sam1};
else
Show_aligned_ref = int2nt(mod_ref(z));
Show_aligned_ref = cellstr(Show_aligned_ref);
Show_aligned_sam = int2nt(pos_sam_in_range(x,z));
Show_aligned_sam = cellstr(Show_aligned_sam);
end
Result_in_range(2*x-1,z) = Show_aligned_ref;
Result_in_range(2*x,z) = Show_aligned_sam;
end
end
% Find highest quality - Do not edit!
highest_q = max(quality_family);
[l, location_highest_q] = find(quality_family==highest_q); % Basically, l=1, so focusing on location_highest_q only.
most_pos_sam = circshift(sam, location_highest_q-1); % Find most possible sequence of sample.
for k = 1:sam_length
if mod_ref(k) == most_pos_sam(k)
most_aligned(k) = 1;
else
most_aligned(k) = 0;
end
end
end
% Code for displaying alignment result (Aligned sequences that have highest quality) - Do not edit!
for m =1:sam_length
if most_aligned(m) == 0 % If two sequences are different at same position, display sequences as red colour using "rgb2Hex" function.
most_aligned_ref.name = int2nt(mod_ref(m));
most_aligned_sam.name = int2nt(most_pos_sam(m));
most_aligned_ref.Color = [255 0 0];
most_aligned_sam.Color = [255 0 0];
pre = '<HTML><FONT color="';
post = '</FONT></HTML>';
listboxStr_ref = cell(numel( most_aligned_ref ),1);
listboxStr_sam = cell(numel( most_aligned_sam ),1);
str_ref = [pre reshape( dec2hex(most_aligned_ref.Color, 2)',1, 6) '">' most_aligned_ref.name post];
listboxStr_ref = str_ref;
Show_most_aligned_ref = {listboxStr_ref};
str_sam = [pre reshape( dec2hex(most_aligned_sam.Color, 2)',1, 6) '">' most_aligned_sam.name post];
listboxStr_sam = str_sam;
Show_most_aligned_sam = {listboxStr_sam};
else
Show_most_aligned_ref = int2nt(mod_ref(m));
Show_most_aligned_ref = cellstr(Show_most_aligned_ref);
Show_most_aligned_sam = int2nt(most_pos_sam(m));
Show_most_aligned_sam = cellstr(Show_most_aligned_sam);
end
Result_highest_quality(1,m) = Show_most_aligned_ref;
Result_highest_quality(2,m) = Show_most_aligned_sam;
end
% In the same way, but consider the case of same length or larger reference length.
else
length_difference = ref_length - sam_length;
nobase = zeros(1,length_difference);
mod_ref = ref;
mod_sam = horzcat(sam,nobase);
for i = 1:ref_length
move_sam = circshift(mod_sam, i-1);
for j = 1:ref_length
if mod_ref(j) == move_sam(j)
aligned(j) = 1;
else
aligned(j) = 0;
end
end
quality = sum(aligned)/ref_length;
quality_family(i) = quality;
% <<<Set the range of quality you are intersted in!>>>
X = find(quality_family >= 0.2);
% Code for displaying alignment result (Aligned sequences in the range) - Do not edit!
Number_quality_in_range = numel(X);
for x = 1:Number_quality_in_range
pos_sam_in_range(Number_quality_in_range,:) = circshift(mod_sam, X(1,Number_quality_in_range)-1);
for y = 1:ref_length
if mod_ref(y) == pos_sam_in_range(x,y)
pos_aligned(x,y) = 1;
else
pos_aligned(x,y) = 0;
end
end
for z = 1:ref_length
if pos_aligned(x,z) == 0
aligned_ref.name = int2nt(mod_ref(z));
aligned_sam.name = int2nt(pos_sam_in_range(x,z));
aligned_ref.Color = [255 0 0];
aligned_sam.Color = [255 0 0];
pre1 = '<HTML><FONT color="';
post1 = '</FONT></HTML>';
listboxStr_ref1 = cell(numel( aligned_ref ),1);
listboxStr_sam1 = cell(numel( aligned_sam ),1);
str_ref1 = [pre1 reshape( dec2hex(aligned_ref.Color, 2)',1, 6) '">' aligned_ref.name post1];
listboxStr_ref1 = str_ref1;
Show_aligned_ref = {listboxStr_ref1};
str_sam1 = [pre1 reshape( dec2hex(aligned_sam.Color, 2)',1, 6) '">' aligned_sam.name post1];
listboxStr_sam1 = str_sam1;
Show_aligned_sam = {listboxStr_sam1};
else
Show_aligned_ref = int2nt(mod_ref(z));
Show_aligned_ref = cellstr(Show_aligned_ref);
Show_aligned_sam = int2nt(pos_sam_in_range(x,z));
Show_aligned_sam = cellstr(Show_aligned_sam);
end
Result_in_range(2*x-1,z) = Show_aligned_ref;
Result_in_range(2*x,z) = Show_aligned_sam;
end
end
highest_q = max(quality_family);
[l, location_highest_q] = find(quality_family==highest_q);
most_pos_sam = circshift(mod_sam, location_highest_q-1);
for k = 1:ref_length
if mod_ref(k) == most_pos_sam(k)
most_aligned(k) = 1;
else
most_aligned(k) = 0;
end
end
end
% Code for displaying alignment result (Aligned sequences that have highest quality) - Do not edit!
for n =1:ref_length
if most_aligned(n) == 0
most_aligned_ref.name = int2nt(mod_ref(n));
most_aligned_sam.name = int2nt(most_pos_sam(n));
most_aligned_ref.Color = [255 0 0];
most_aligned_sam.Color = [255 0 0];
pre = '<HTML><FONT color="';
post = '</FONT></HTML>';
listboxStr_ref = cell(numel( most_aligned_ref ),1);
listboxStr_sam = cell(numel( most_aligned_sam ),1);
str_ref = [pre reshape( dec2hex(most_aligned_ref.Color, 2)',1, 6) '">' most_aligned_ref.name post];
listboxStr_ref = str_ref;
Show_most_aligned_ref = {listboxStr_ref};
str_sam = [pre reshape( dec2hex(most_aligned_sam.Color, 2)',1, 6) '">' most_aligned_sam.name post];
listboxStr_sam = str_sam;
Show_most_aligned_sam = {listboxStr_sam};
else
Show_most_aligned_ref = int2nt(mod_ref(n));
Show_most_aligned_ref = cellstr(Show_most_aligned_ref);
Show_most_aligned_sam = int2nt(most_pos_sam(n));
Show_most_aligned_sam = cellstr(Show_most_aligned_sam);
end
Result_highest_quality(1,n) = Show_most_aligned_ref;
Result_highest_quality(2,n) = Show_most_aligned_sam;
end
end
% Displaying qualities - Do not Edit!
fprintf('The qualities '); cprintf('_red', 'in the range'); fprintf(' are'); disp(quality_family(X));
fprintf('The cells '); cprintf('_red', '"Result_in_range"'); fprintf(' showing the possible alignment matching in the range of quality you had set. \n\n\n\n');
fprintf('The '); cprintf('_red', 'highest'); fprintf(' quality is'); disp(highest_q);
fprintf('The cells '); cprintf('_red', '"Result_highest_quality"'); fprintf(' showing the most possible alignment matching between refenrence gene and your sample gene. \n\n');