Browse Source

Reorganize the seq utilities.

Regroup all command-line tools into a single Click-based master tool.
master
Damien Goutte-Gattat 2 years ago
parent
commit
1b4084fd91
  1. 78
      incenp/bio/seq/dotter.py
  2. 44
      incenp/bio/seq/plasmidmap.py
  3. 260
      incenp/bio/seq/seqtool.py
  4. 166
      incenp/bio/seq/sequtil.py
  5. 97
      incenp/bio/seq/silencing.py
  6. 194
      incenp/bio/seq/utils.py
  7. 56
      incenp/bio/seq/wrappers.py
  8. 11
      setup.py

78
incenp/bio/seq/dotter.py

@ -1,78 +0,0 @@
# -*- coding: utf-8 -*-
# Incenp.Bioutils - Incenp.org's utilities for computational biology
# Copyright © 2018,2020 Damien Goutte-Gattat
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Dotter wrapper program
Wrapper tool to call the seqtools' dotter program on any kind
of sequence files instead of only FASTA files.
"""
from argparse import ArgumentParser
from os import unlink
from subprocess import call
from tempfile import NamedTemporaryFile
from Bio.SeqIO import write
from incenp.bio.seq.usa import read_usa
def dotter(horizontal, vertical):
"""Call the dotter program on the specified sequences.
:param horizontal: the sequence to plot on the horizontal axis
:type: :class:`Bio.SeqRecord.SeqRecord`
:param vertical: the sequence to plot on the vertical axis
:type: :class:`Bio.SeqRecord.SeqRecord`
"""
try:
hf = NamedTemporaryFile(mode='w', delete=False)
vf = NamedTemporaryFile(mode='w', delete=False)
write([horizontal], hf, 'fasta')
hf.close()
write([vertical], vf, 'fasta')
vf.close()
call(['dotter', hf.name, vf.name])
finally:
unlink(hf.name)
unlink(vf.name)
def main():
parser = ArgumentParser(description="wrapper for the dotter program")
parser.add_argument('hseq', help="the horizontal sequence, as a USA")
parser.add_argument('vseq', help="the vertical sequence, as a USA")
args = parser.parse_args()
try:
hseq = read_usa(args.hseq)[0]
vseq = read_usa(args.vseq)[0]
except Exception as e:
parser.error("cannot read source sequences: {}".format(e))
try:
dotter(hseq, vseq)
except Exception as e:
parser.error("cannot run dotter: {}".format(e))
if __name__ == '__main__':
main()

44
incenp/bio/seq/plasmidmap.py

@ -17,14 +17,10 @@
"""Graphical description of plasmids."""
from argparse import ArgumentParser
from Bio.Graphics import GenomeDiagram
from Bio.Restriction import Analysis, RestrictionBatch
from Bio.SeqFeature import SeqFeature, FeatureLocation
from incenp.bio.seq.usa import read_usa
from reportlab.lib import colors, pagesizes
from reportlab.pdfgen.canvas import Canvas
def _get_feature_label(feature):
@ -162,43 +158,3 @@ def summarize_vector(canvas, vector):
canvas.drawText(flist)
canvas.showPage()
def main():
parser = ArgumentParser(description="draw plasmid maps")
parser.add_argument('sequences', nargs='+',
help="the input sequences, as USAs")
parser.add_argument('-o', '--output', nargs='?', default='plasmm.pdf',
help="""write to the specified file instead of
plasmm.pdf""")
parser.add_argument('-e', '--enzymes', nargs='?', default=None,
help="list of enzymes to display")
args = parser.parse_args()
if args.enzymes:
_enzymes = RestrictionBatch()
for enz in args.enzymes.split(','):
_enzymes.add(enz)
records = []
for f in args.sequences:
try:
records.extend(read_usa(f))
except Exception as e:
parser.error("cannot read {}: {}".format(f, e))
c = Canvas(args.output, pagesize=pagesizes.A4)
for record in records:
if record.annotations.get('molecule_type', 'DNA') == 'protein':
continue
summarize_vector(c, record)
try:
c.save()
except Exception as e:
parser.error("cannot write output: {}".format(e))
if __name__ == '__main__':
main()

260
incenp/bio/seq/seqtool.py

@ -0,0 +1,260 @@
# -*- coding: utf-8 -*-
# Incenp.Bioutils - Incenp.org's utilities for computational biology
# Copyright © 2020 Damien Goutte-Gattat
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""A command-line tool to manipulate biological sequence files."""
import sys
from Bio.Restriction.Restriction import RestrictionBatch
import click
from incenp.bio import __version__
from incenp.bio.seq import utils
from incenp.bio.seq import wrappers
from incenp.bio.seq.plasmidmap import summarize_vector
from incenp.bio.seq.usa import read_usa, write_usa
from reportlab.pdfgen.canvas import Canvas
prog_name = "seqtool"
prog_notice = f"""\
{prog_name} {__version__}
Copyright © 2020 Damien Goutte-Gattat
This program is released under the GNU General Public License.
See the COPYING file or <http://www.gnu.org/licenses/gpl.html>.
"""
@click.group(context_settings={'help_option_names': ['-h', '--help']})
@click.version_option(version=__version__, message=prog_notice)
@click.pass_context
def seqtool(ctx):
pass
@seqtool.command()
@click.argument('sequences', nargs=-1)
@click.option('--output', '-o', default='genbank::stdout', metavar="USA",
help="""Write to the specified USA instead of standard
output.""")
@click.option('--name', '-n', metavar="NAME", help="Set the sequence name.")
@click.option('--accession', '-a', metavar="ACC", help="Set the sequence ID.")
@click.option('--description', '-d', help="Set the sequence description.")
@click.option('--circular', '-c', 'topology', flag_value='circular',
help="Force circular topology.")
@click.option('--linear', 'topology', flag_value='linear', default=True,
help="Force linear topology.")
@click.option('--division', '-D', metavar="DIV",
help="""The data file division to use if none is already
assigned to the sequence.""")
@click.option('--clean', '-C', is_flag=True,
help="Clean the output sequence.")
@click.option('--remove-external-features', '-r', is_flag=True,
help="Remove features referring to an external sequence.")
@click.pass_context
def cat(ctx, sequences, output, name, accession, description, topology,
division, clean, remove_external_features):
"""Read and write sequences.
This tool reads the specified input SEQUENCES and catenate them into
a single written to standard output.
"""
inputs = []
for f in sequences:
try:
inputs.extend(read_usa(f))
except Exception as e:
raise click.ClickException(f"Cannot read {f}: {e}")
if len(inputs) == 0:
ctx.exit()
outrec = inputs[0]
for record in inputs[1:]:
outrec += record
if clean:
utils.clean(outrec, div=division, topology=topology)
if remove_external_features:
utils.remove_external_features(outrec)
if name:
outrec.name = name
if accession:
outrec.id = accession
if description:
outrec.description = description
try:
write_usa([outrec], output)
except Exception as e:
raise click.ClickException(f"Cannot write to {output}: {e}")
@seqtool.command()
@click.argument('source')
@click.option('--output', '-o', metavar="USA", default='fasta::stdout',
help="""Write to the specified USA instead of standard
output.""")
@click.pass_context
def siresist(ctx, source, output):
"""Silently mutate a CDS.
Creates a variant of the SOURCE sequence with silent mutations.
"""
try:
source = read_usa(source)[0]
except Exception as e:
raise click.ClickException(f"Cannot read {source}: {e}")
source.seq = utils.silently_mutate(source.seq)
try:
write_usa([source], output)
except Exception as e:
raise click.ClickException(f"Cannot write to {output}: {e}")
@seqtool.command()
@click.argument('source')
@click.argument('destination')
@click.option('--output', '-o', metavar="USA",
help="""Write to the specified USA instead of standard
output.""")
@click.option('--reaction', '-r', type=click.Choice(['BP', 'LR']),
default='LR',
help="Specify the type of Gateway reaction.")
@click.option('--name', '-n', metavar="NAME", help="Set the sequence name.")
@click.option('--accession', '-a', metavar="ACC", help="Set the sequence ID.")
@click.option('--description', '-d', help="Set the sequence description.")
@click.option('--clean', '-c', is_flag=True, help="Clean the output sequence.")
@click.pass_context
def gateway(ctx, source, destination, output, reaction, name, accession,
description, clean):
"""Perform in-silico Gateway cloning."""
try:
source = read_usa(source)[0]
destination = read_usa(destination)[0]
except Exception as e:
raise click.ClickException(f"Cannot read input sequences: {e}")
clone = utils.gateway(source, destination, reaction, sys.stderr)
if not clone:
raise click.ClickException("Gateway cloning not possible")
clone.name = name or source.name
clone.id = accession or source.id
clone.description = description or source.description
clone.annotations['date'] = utils.genbank_date()
if clean:
utils.clean(clone)
try:
write_usa([clone], output)
except Exception as e:
raise click.ClickException(f"Cannot write output: {e}")
@seqtool.command()
@click.argument('sequences')
@click.option('--output', '-o', metavar="FILE", default='plasmm.pdf',
help="Write to the specified file.", show_default=True)
@click.option('--enzymes', '-e', help="Specify the enzymes to display.")
@click.pass_context
def plasmm(ctx, sequences, output, enzymes):
"""Draw plasmid maps."""
if enzymes:
_enzymes = RestrictionBatch()
for enz in enzymes.split(','):
_enzymes.add(enz)
records = []
for f in sequences:
try:
records.extend(read_usa(f))
except Exception as e:
raise click.ClickException(f"Cannot read {f}: {e}")
c = Canvas(output)
for record in records:
if record.annotations.get('molecule_type', 'DNA') == 'protein':
continue
summarize_vector(c, record)
try:
c.save()
except Exception as e:
raise click.ClickException(f"Cannot write output: {e}")
@seqtool.command()
@click.argument('subject')
@click.argument('query')
@click.option('--type', '-t', 'blast_type', default='blastn',
type=click.Choice(['blastn', 'blastp', 'blastx', 'tblastn',
'tblastx']),
help="The type of alignment to perform.")
@click.option('--database', '-d', is_flag=True,
help="Treat SUBJECT as a database name.")
@click.option('--short', '-s', is_flag=True,
help="Optimize BLAST for short matches.")
def blast(ctx, subject, query, blast_type, database, short):
"""Wrapper for the BLAST programs.
SUBJECT and QUERY should be USAs representing the subject and query
sequences, respectively.
"""
try:
if not database:
subject = read_usa(subject)[0]
query = read_usa(query)[0]
except Exception as e:
raise click.ClickException(f"Cannot read source sequences: {e}")
try:
stdout, _ = wrappers.blast(query, subject, blast_type, database, short)
print(stdout)
except BrokenPipeError:
pass
except Exception as e:
raise click.ClickException(f"Cannot run blast: {e}")
@seqtool.command()
@click.argument('hseq')
@click.argument('vseq')
@click.pass_context
def dotter(ctx, hseq, vseq):
"""Wrapper for the dotter program."""
try:
hseq = read_usa(hseq)[0]
vseq = read_usa(vseq)[0]
except Exception as e:
raise click.ClickException(f"Cannot read source sequences: {e}")
try:
wrappers.dotter(hseq, vseq)
except Exception as e:
raise click.ClickException(f"Cannot run dotter: {e}")
if __name__ == '__main__':
seqtool()

166
incenp/bio/seq/sequtil.py

@ -1,166 +0,0 @@
# -*- coding: utf-8 -*-
# Incenp.Bioutils - Incenp.org's utilities for computational biology
# Copyright © 2018,2020 Damien Goutte-Gattat
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Utility functions for working on biological sequences
This module is intended to provide various helper functions
for working on sequence files.
"""
from argparse import ArgumentParser
from datetime import datetime
from incenp.bio.seq.usa import read_usa, write_usa
def genbank_date(date=None):
if date is None:
date = datetime.now()
return date.strftime('%d-%b-%Y').upper()
def remove_external_features(record):
for feature in [f for f in record.features[:] if f.ref is not None]:
record.features.remove(feature)
def clean(record, div='UNK', topology='linear'):
# Set the data division if needed
if (not 'data_file_division' in record.annotations
or record.annotations['data_file_division'].isspace()):
record.annotations['data_file_division'] = div
# Fix topology
if not 'topology' in record.annotations:
record.annotations['topology'] = topology
# Fix missing molecule type
if not 'molecule_type' in record.annotations:
record.annotations['molecule_type'] = guess_molecule_type(record.seq)
# Translate CDS
for cds in [f for f in record.features if f.type == 'CDS']:
if 'translation' in cds.qualifiers:
continue
if record.annotations['molecule_type'] == 'protein':
continue
seq = record.seq[cds.location.start.position:cds.location.end.position]
if cds.strand == -1:
seq = seq.reverse_complement()
cds.qualifiers['translation'] = str(seq.translate())
# Remove Ugene stuff
for feature in [f for f in record.features[:] if 'ugene_name' in f.qualifiers]:
# Restriction site features
if 'cut' in feature.qualifiers:
record.features.remove(feature)
# Fragments
if 'ugene_group' in feature.qualifiers and 'fragments' in feature.qualifiers['ugene_group']:
record.features.remove(feature)
# Source
if 'source' in feature.qualifiers['ugene_name']:
record.features.remove(feature)
feature.qualifiers.pop('ugene_name', None)
feature.qualifiers.pop('ugene_group', None)
# Remove SerialCloner stuff
for feature in record.features:
feature.qualifiers.pop('SerialCloner_Color', None)
feature.qualifiers.pop('SerialCloner_Show', None)
feature.qualifiers.pop('SerialCloner_Protect', None)
feature.qualifiers.pop('SerialCloner_Arrow', None)
_alphabets = {
'DNA': 'ACGTRYSWKMBDHVN',
'RNA': 'ACGURYSWKMBDHVN',
'protein': 'ACDEFGHIKLMNPQRSTVWY*'
}
def guess_molecule_type(sequence, alphabets=_alphabets):
seqset = set(str(sequence).upper())
for typename in alphabets.keys():
leftover = seqset - set(alphabets[typename])
if not leftover:
return typename
return 'DNA'
def main():
parser = ArgumentParser(description="read and write sequences")
igroup = parser.add_argument_group(title="input/output")
igroup.add_argument('sequences', nargs='+',
help="the input sequences, as USAs")
igroup.add_argument('-o', '--output', nargs='?', default='genbank::stdout',
help="""write to the specified file (as USA) instead
of standard output""")
sgroup = parser.add_argument_group(title="output sequence properties")
sgroup.add_argument('-n', '--name', help="set the sequence name")
sgroup.add_argument('-a', '--accession', help="set the sequence ID")
sgroup.add_argument('-d', '--description', help="set the sequence description")
sgroup.add_argument('-c', '--circular', action='store_const',
dest='topology', const='circular', default='linear',
help="force circular topology if needed")
sgroup.add_argument('-D', '--division', nargs='?',
help="""data file division to use if none is already
assigned to the sequence""")
ogroup = parser.add_argument_group(title="optional features")
ogroup.add_argument('-C', '--clean', action='store_true',
help="clean the output sequence")
ogroup.add_argument('-r', '--remove-external-features', dest='remove_ext_feat',
action='store_true',
help="remove features referring to an external sequence")
args = parser.parse_args()
inputs = []
for f in args.sequences:
try:
inputs.extend(read_usa(f))
except Exception as e:
parser.error("cannot read {}: {}".format(f, e))
output = inputs[0]
for record in inputs[1:]:
output += record
if args.clean:
clean(output, div=args.division, topology=args.topology)
if args.remove_ext_feat:
remove_external_features(output)
if args.name:
output.name = args.name
if args.accession:
output.id = args.accession
if args.description:
output.description = args.description
try:
write_usa([output], args.output)
except Exception as e:
parser.error("cannot write to {}: {}".format(args.output, e))
if __name__ == '__main__':
main()

97
incenp/bio/seq/silencing.py

@ -1,97 +0,0 @@
# -*- coding: utf-8 -*-
# Incenp.Bioutils - Incenp.org's utilities for computational biology
# Copyright © 2018,2020 Damien Goutte-Gattat
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Functions to work on siRNA and similar stuff."""
import argparse
from random import sample
from Bio.Data import CodonTable
from Bio.Seq import MutableSeq
from incenp.bio.seq.usa import read_usa, write_usa
_reverse_codon_tables = {}
def _build_reverse_codon_table(code):
rt = {}
for residue in code.forward_table.values():
codons_list = []
for k, v in code.forward_table.items():
if v == residue:
codons_list.append(k)
rt[residue] = codons_list
return rt
def silently_mutate(sequence, code=None):
"""Mutate a CDS without changing its translation.
Generate a mutated sequence by altering all possible codons
without introducing any changes in the aminoacids translation.
:param sequence: the sequence to mutate
:type: :class:`Bio.Seq.Seq`
:param code: the codon table to use (defaults to the standard genetic code)
:type: :class:`Bio.Data.CodonTable`
:return: the mutated sequence
:rtype: :class:`Bio.Seq.Seq`
"""
if not code:
code = CodonTable.unambiguous_dna_by_name['Standard']
if not code in _reverse_codon_tables:
_reverse_codon_tables[code] = _build_reverse_codon_table(code)
rt = _reverse_codon_tables[code]
output = MutableSeq('')
for i in range(int(len(sequence) / 3)):
codon = str(sequence[i * 3:(i * 3) + 3])
if codon in code.forward_table:
residue = code.forward_table[codon]
possible_codons = rt[residue]
if len(possible_codons) > 1:
codon = sample([a for a in possible_codons if a != codon], 1)[0]
output += codon
return output
def main():
parser = argparse.ArgumentParser(description="silently mutate a CDS")
parser.add_argument('source', help="the source sequence, as a USA")
parser.add_argument('--output', '-o', nargs='?', default='fasta::stdout',
help="""write to the specified file (as USA) instead
of standard output""")
args = parser.parse_args()
try:
source = read_usa(args.source)[0]
except Exception as e:
parser.error("cannot read {}: {}".format(args.source, e))
source.seq = silently_mutate(source.seq)
try:
write_usa([source], args.output)
except Exception as e:
parser.error("cannot write to {}: {}".format(args.output, e))
if __name__ == '__main__':
main()

194
incenp/bio/seq/gateway.py → incenp/bio/seq/utils.py

@ -15,17 +15,144 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Gateway cloning module."""
"""Utility functions for working on biological sequences
from argparse import ArgumentParser
from sys import stderr
This module is intended to provide various helper functions
for working on sequence files.
"""
from Bio.Seq import Seq
from datetime import datetime
from random import sample
from Bio.Data import CodonTable
from Bio.Seq import MutableSeq, Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
from incenp.bio import Error
from incenp.bio.seq.sequtil import clean, genbank_date
from incenp.bio.seq.usa import read_usa, write_usa
_reverse_codon_tables = {}
def _build_reverse_codon_table(code):
rt = {}
for residue in code.forward_table.values():
codons_list = []
for k, v in code.forward_table.items():
if v == residue:
codons_list.append(k)
rt[residue] = codons_list
return rt
def silently_mutate(sequence, code=None):
"""Mutate a CDS without changing its translation.
Generate a mutated sequence by altering all possible codons
without introducing any changes in the aminoacids translation.
:param sequence: the sequence to mutate
:type: :class:`Bio.Seq.Seq`
:param code: the codon table to use (defaults to the standard genetic code)
:type: :class:`Bio.Data.CodonTable`
:return: the mutated sequence
:rtype: :class:`Bio.Seq.Seq`
"""
if not code:
code = CodonTable.unambiguous_dna_by_name['Standard']
if not code in _reverse_codon_tables:
_reverse_codon_tables[code] = _build_reverse_codon_table(code)
rt = _reverse_codon_tables[code]
output = MutableSeq('')
for i in range(int(len(sequence) / 3)):
codon = str(sequence[i * 3:(i * 3) + 3])
if codon in code.forward_table:
residue = code.forward_table[codon]
possible_codons = rt[residue]
if len(possible_codons) > 1:
codon = sample([a for a in possible_codons if a != codon], 1)[0]
output += codon
return output
def genbank_date(date=None):
if date is None:
date = datetime.now()
return date.strftime('%d-%b-%Y').upper()
def remove_external_features(record):
for feature in [f for f in record.features[:] if f.ref is not None]:
record.features.remove(feature)
def clean(record, div='UNK', topology='linear'):
# Set the data division if needed
if (not 'data_file_division' in record.annotations
or record.annotations['data_file_division'].isspace()):
record.annotations['data_file_division'] = div
# Fix topology
if not 'topology' in record.annotations:
record.annotations['topology'] = topology
# Fix missing molecule type
if not 'molecule_type' in record.annotations:
record.annotations['molecule_type'] = guess_molecule_type(record.seq)
# Translate CDS
for cds in [f for f in record.features if f.type == 'CDS']:
if 'translation' in cds.qualifiers:
continue
if record.annotations['molecule_type'] == 'protein':
continue
seq = record.seq[cds.location.start.position:cds.location.end.position]
if cds.strand == -1:
seq = seq.reverse_complement()
cds.qualifiers['translation'] = str(seq.translate())
# Remove Ugene stuff
for feature in [f for f in record.features[:] if 'ugene_name' in f.qualifiers]:
# Restriction site features
if 'cut' in feature.qualifiers:
record.features.remove(feature)
# Fragments
if 'ugene_group' in feature.qualifiers and 'fragments' in feature.qualifiers['ugene_group']:
record.features.remove(feature)
# Source
if 'source' in feature.qualifiers['ugene_name']:
record.features.remove(feature)
feature.qualifiers.pop('ugene_name', None)
feature.qualifiers.pop('ugene_group', None)
# Remove SerialCloner stuff
for feature in record.features:
feature.qualifiers.pop('SerialCloner_Color', None)
feature.qualifiers.pop('SerialCloner_Show', None)
feature.qualifiers.pop('SerialCloner_Protect', None)
feature.qualifiers.pop('SerialCloner_Arrow', None)
_alphabets = {
'DNA': 'ACGTRYSWKMBDHVN',
'RNA': 'ACGURYSWKMBDHVN',
'protein': 'ACDEFGHIKLMNPQRSTVWY*'
}
def guess_molecule_type(sequence, alphabets=_alphabets):
seqset = set(str(sequence).upper())
for typename in alphabets.keys():
leftover = seqset - set(alphabets[typename])
if not leftover:
return typename
return 'DNA'
_gateway_sequences = {
'attB1': 'ACAAGTTTGTACAAAAAAGCAGGCT',
@ -64,7 +191,7 @@ _gateway_reactions = {
}
def _find_sites(sequence, reaction='LR', kind='source', log=None):
def _find_gateway_sites(sequence, reaction='LR', kind='source', log=None):
indexes = []
for i in [0, 1]:
name = _gateway_reactions[reaction][kind][i]
@ -98,14 +225,14 @@ def gateway(source, destination, reaction='LR', log=None):
if reaction not in _gateway_reactions:
raise Error("Invalid Gateway reaction type: {}".format(reaction))
src_indexes = _find_sites(source, reaction, 'source', log)
src_indexes = _find_gateway_sites(source, reaction, 'source', log)
if len(src_indexes) != 2:
return
source_part = source[src_indexes[0]:src_indexes[1]]
if log:
log.write("Found source region: {}..{}\n".format(src_indexes[0], src_indexes[1]))
dst_indexes = _find_sites(destination, reaction, 'target', log)
dst_indexes = _find_gateway_sites(destination, reaction, 'target', log)
if len(dst_indexes) != 2:
return
dest_parts = [destination[:dst_indexes[0]], destination[dst_indexes[1]:]]
@ -129,52 +256,3 @@ def gateway(source, destination, reaction='LR', log=None):
clone = dest_parts[0] + result_sites[0] + source_part + result_sites[1] + dest_parts[1]
return clone
def main():
parser = ArgumentParser(description="perform in-silico Gateway clonings")
parser.add_argument('source', help="the source sequence, as a USA")
parser.add_argument('destination', help="the destination sequence, as a USA")
parser.add_argument('--output', '-o', default='genbank::stdout',
help="""write to the specified file (as USA) instead
of standard output""")
parser.add_argument('--reaction', '-r', choices=['BP', 'LR'], default='LR',
help="specify the type of Gateway reaction")
sgroup = parser.add_argument_group("sequence properties")
sgroup.add_argument('-n', '--name', help="set the sequence name")
sgroup.add_argument('-a', '--accession', help="set the sequence ID")
sgroup.add_argument('-d', '--description', help="set the sequence description")
sgroup.add_argument('-c', '--clean', action='store_true',
help="clean the output sequence")
args = parser.parse_args()
try:
source = read_usa(args.source)[0]
dest = read_usa(args.destination)[0]
except Exception as e:
parser.error("cannot read input sequences: {}".format(e))
clone = gateway(source, dest, args.reaction, stderr)
if not clone:
parser.error("Gateway cloning not possible")
clone.name = args.name or source.name
clone.id = args.accession or source.id
clone.description = args.description or source.description
clone.annotations['molecule_type'] = 'DNA'
clone.annotations['topology'] = 'circular'
clone.annotations['date'] = genbank_date()
if args.clean:
clean(clone)
try:
write_usa([clone], args.output)
except Exception as e:
parser.error("cannot write output: {}".format(e))
if __name__ == '__main__':
main()

56
incenp/bio/seq/blast.py → incenp/bio/seq/wrappers.py

@ -15,19 +15,18 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""BLAST wrapper program
"""Wrappers for external programs
Wrapper tool to call NCBI BLAST+ programs on any kind
of sequence files instead of only FASTA files.
This module provides wrapper functions to call third-party sequence
manipulation tools.
"""
from argparse import ArgumentParser
from os import unlink
from subprocess import call
from tempfile import NamedTemporaryFile
from Bio.Blast import Applications
from Bio.SeqIO import write
from incenp.bio.seq.usa import read_usa
_applications = {
'blastn': Applications.NcbiblastnCommandline,
@ -86,37 +85,26 @@ def blast(query, subject, kind='blastn', is_database=False, short=False):
unlink(qf.name)
def main():
parser = ArgumentParser(description="wrapper for the BLAST programs")
parser.add_argument('subject', help="the subject sequence, as a USA")
parser.add_argument('query', help="the query sequence, as a USA")
parser.add_argument('--type', '-t',
choices=_applications.keys(), default='blastn',
help="type of alignment to perform")
parser.add_argument('--database', '-d', action='store_true',
help="treat subject as a database name")
parser.add_argument('--short', '-s', action='store_true',
help="blast for short matches")
args = parser.parse_args()
def dotter(horizontal, vertical):
"""Call the dotter program on the specified sequences.
:param horizontal: the sequence to plot on the horizontal axis
:type: :class:`Bio.SeqRecord.SeqRecord`
:param vertical: the sequence to plot on the vertical axis
:type: :class:`Bio.SeqRecord.SeqRecord`
"""
try:
if not args.database:
subject = read_usa(args.subject)[0]
else:
subject = args.subject
query = read_usa(args.query)[0]
except Exception as e:
parser.error("cannot read source sequences: {}".format(e))
hf = NamedTemporaryFile(mode='w', delete=False)
vf = NamedTemporaryFile(mode='w', delete=False)
try:
stdout, _ = blast(query, subject, args.type, args.database, args.short)
print(stdout)
except BrokenPipeError:
pass
except Exception as e:
parser.error("cannot run blast: {}".format(e))
write([horizontal], hf, 'fasta')
hf.close()
write([vertical], vf, 'fasta')
vf.close()
if __name__ == '__main__':
main()
call(['dotter', hf.name, vf.name])
finally:
unlink(hf.name)
unlink(vf.name)

11
setup.py

@ -37,6 +37,10 @@ setup(
'Programming Language :: Python :: 3'
],
install_requires=[
'click'
],
packages=[
'incenp',
'incenp.bio',
@ -46,12 +50,7 @@ setup(
entry_points={
'console_scripts': [
'sequtil = incenp.bio.seq.sequtil:main',
'siresist = incenp.bio.seq.silencing:main',
'iblast = incenp.bio.seq.blast:main',
'idotter = incenp.bio.seq.dotter:main',
'gateway = incenp.bio.seq.gateway:main',
'plasmm = incenp.bio.seq.plasmidmap:main',
'seqtool = incenp.bio.seq.seqtool:main',
'cc3d-runner = incenp.bio.modelling.cc3d:main'
]
}

Loading…
Cancel
Save