function [chr, pos, strand, seq, m, mm, pv, qv, gene_forward, ...
    gene_forward_dist, gene_backward, gene_backward_dist] = ...
    search_genome_for_seq(genomic_fa_file, query, max_mm, ...
    also_rev_comp, gene_info, gene_window)
% Finds all matches of the given query sequence in the given reference
% genome. The reference genome is given as the path of the fasta file. The
% query is given as a string sequence. Matching is a brute-force scan that
% compares the query against every window of the reference and keeps the
% windows with at most max_mm mismatching positions. Comparison is
% case-insensitive (both reference and query are upper-cased).
%
% Inputs:
% - genomic_fa_file: [string] path to the genomic fasta file
% - query: [string] query sequence
% - max_mm: [integer] max mismatches allowed
% - also_rev_comp: [boolean] also search for the reverse complement?
% - gene_info: [structure] Gene info structure with the fields:
%       - Chr
%       - Start
%       - End
%       - Strand
%       - Transcript
% - gene_window: [int] Search for genes within this distance
%
% Outputs (one entry per match, in scan order: chromosome, then position,
% forward hit before reverse hit at the same position):
% - chr: [cell] sequence name (fasta header up to the first space)
% - pos: [vector] 1-based scan position of the match. For strand 1 this
%       is the first base of the matched window; for strand -1 it is the
%       last base of the matched window.
% - strand: [vector] 1 for a forward hit, -1 for a reverse hit
% - seq: [cell] matched reference window (passed through reversesequence
%       for strand -1 hits)
% - m, mm: [vector] number of matching / mismatching positions
% - pv, qv: [vector] initialised to 0 for every match in this section
% - gene_forward, gene_forward_dist: transcript of the first gene_info
%       entry on the same chromosome with Strand == 1 and Start in
%       [pos, pos + gene_window], and Start - pos ('' and inf if none)
% - gene_backward, gene_backward_dist: transcript of the last gene_info
%       entry on the same chromosome with Strand == -1 and End in
%       [pos - gene_window, pos], and pos - End ('' and inf if none)
%
% NOTE(review): "first"/"last" refer to the order of entries in gene_info,
% so the reported gene is the nearest one only if gene_info is sorted by
% coordinate - confirm against the caller.

[ref_chr, ref_seq] = fastaread(genomic_fa_file);

% fastaread returns plain character vectors (not cell arrays) when the
% file contains a single record; wrap them so the brace indexing below
% works for one-sequence references too.
if ~iscell(ref_chr)
    ref_chr = {ref_chr};
    ref_seq = {ref_seq};
end

for c = 1:length(ref_chr)
    % Keep only the first space-delimited token of the header as the name
    ref_chr{c} = regexp(ref_chr{c}, ' ', 'split');
    ref_chr{c} = ref_chr{c}{1};
    ref_seq{c} = upper(ref_seq{c});
end

query = upper(query);
% reversesequence is defined outside this section.
% NOTE(review): the flag name also_rev_comp suggests it returns the
% reverse complement - confirm.
rev_query = reversesequence(query);

% Initialise results (arrays grow automatically beyond 1000 matches)
chr = cell(1000, 1);
pos = zeros(1000, 1);
strand = ones(1000, 1);
seq = cell(1000, 1);
m = zeros(1000, 1);
mm = zeros(1000, 1);
pv = zeros(1000, 1);
qv = zeros(1000, 1);
gene_forward = cell(1000, 1);
gene_forward_dist = inf(1000, 1);
gene_backward = cell(1000, 1);
gene_backward_dist = inf(1000, 1);

% mp is a counter of match positions
mp = 0;

query_len = length(query);

% Loop over chromosomes
for c = 1:length(ref_chr)
    fprintf('%s\n', ref_chr{c});

    for p = 1:length(ref_seq{c})
        % Progress report every million positions
        if (mod(p, 1000000) == 0)
            fprintf('%i - matches = %i\n', p, mp);
        end

        % Search for the query in its forward form: window starts at p
        if (p <= length(ref_seq{c}) - query_len + 1)
            local_mm = sum(ref_seq{c}(p:(p+query_len-1)) ~= query);
            if (local_mm <= max_mm)
                mp = mp + 1;
                chr{mp} = ref_chr{c};
                pos(mp) = p;
                strand(mp) = 1;
                seq{mp} = ref_seq{c}(p:(p+query_len-1));
                m(mp) = query_len - local_mm;
                mm(mp) = local_mm;
                pv(mp) = 0;
                qv(mp) = 0;
            end
        end

        % Search for the query in its reverse form (if requested):
        % window ends at p
        if (also_rev_comp && p >= query_len)
            local_mm = sum(ref_seq{c}((p-query_len+1):p) ~= rev_query);
            if (local_mm <= max_mm)
                mp = mp + 1;
                chr{mp} = ref_chr{c};
                pos(mp) = p;
                strand(mp) = -1;
                seq{mp} = reversesequence(ref_seq{c}((p-query_len+1):p));
                m(mp) = query_len - local_mm;
                mm(mp) = local_mm;
                pv(mp) = 0;
                qv(mp) = 0;
            end
        end
    end
end

% Annotate every match with its neighbouring genes. The lookup depends
% only on the match's chromosome and position, so it is done once here
% for hits of both strands instead of being duplicated in each branch
% of the scan. gene_info is only touched when there is at least one match.
for k = 1:mp
    % Matches are grouped by chromosome, so the membership mask only
    % needs recomputing when the chromosome name changes.
    if (k == 1 || ~strcmp(chr{k}, chr{k-1}))
        on_this_chr = ismember(gene_info.Chr, chr(k));
    end
    hit_pos = pos(k);

    % find forward gene
    I = find(on_this_chr ...
        & gene_info.Strand == 1 ...
        & gene_info.Start >= hit_pos ...
        & gene_info.Start <= hit_pos + gene_window, ...
        1, 'first');
    if ~isempty(I)
        gene_forward{k} = gene_info.Transcript{I};
        gene_forward_dist(k) = gene_info.Start(I) - hit_pos;
    else
        gene_forward{k} = '';
        gene_forward_dist(k) = inf;
    end

    % find reverse gene
    I = find(on_this_chr ...
        & gene_info.Strand == -1 ...
        & gene_info.End <= hit_pos ...
        & gene_info.End >= hit_pos - gene_window, ...
        1, 'last');
    if ~isempty(I)
        gene_backward{k} = gene_info.Transcript{I};
        gene_backward_dist(k) = hit_pos - gene_info.End(I);
    else
        gene_backward{k} = '';
        gene_backward_dist(k) = inf;
    end
end

% Remove extra initialised lines in results
chr(mp+1:end) = [];
pos(mp+1:end) = [];
strand(mp+1:end) = [];
seq(mp+1:end) = [];
m(mp+1:end) = [];
mm(mp+1:end) = [];
pv(mp+1:end) = [];
qv(mp+1:end) = [];
gene_forward(mp+1:end) = [];
gene_forward_dist(mp+1:end) = [];
gene_backward(mp+1:end) = [];
gene_backward_dist(mp+1:end) = [];

% Calculate pv and qv