8 changed files with 423 additions and 483 deletions
@ -1,78 +0,0 @@
|
||||
# -*- coding: utf-8 -*- |
||||
# Incenp.Bioutils - Incenp.org's utilities for computational biology |
||||
# Copyright © 2018,2020 Damien Goutte-Gattat |
||||
# |
||||
# This program is free software; you can redistribute it and/or modify |
||||
# it under the terms of the GNU General Public License as published by |
||||
# the Free Software Foundation; either version 3 of the License, or |
||||
# (at your option) any later version. |
||||
# |
||||
# This program is distributed in the hope that it will be useful, |
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
# GNU General Public License for more details. |
||||
# |
||||
# You should have received a copy of the GNU General Public License |
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
"""Dotter wrapper program |
||||
|
||||
Wrapper tool to call the seqtools' dotter program on any kind |
||||
of sequence files instead of only FASTA files. |
||||
""" |
||||
|
||||
from argparse import ArgumentParser |
||||
from os import unlink |
||||
from subprocess import call |
||||
from tempfile import NamedTemporaryFile |
||||
|
||||
from Bio.SeqIO import write |
||||
from incenp.bio.seq.usa import read_usa |
||||
|
||||
|
||||
def dotter(horizontal, vertical): |
||||
"""Call the dotter program on the specified sequences. |
||||
|
||||
:param horizontal: the sequence to plot on the horizontal axis |
||||
:type: :class:`Bio.SeqRecord.SeqRecord` |
||||
:param vertical: the sequence to plot on the vertical axis |
||||
:type: :class:`Bio.SeqRecord.SeqRecord` |
||||
""" |
||||
|
||||
try: |
||||
hf = NamedTemporaryFile(mode='w', delete=False) |
||||
vf = NamedTemporaryFile(mode='w', delete=False) |
||||
|
||||
write([horizontal], hf, 'fasta') |
||||
hf.close() |
||||
|
||||
write([vertical], vf, 'fasta') |
||||
vf.close() |
||||
|
||||
call(['dotter', hf.name, vf.name]) |
||||
finally: |
||||
unlink(hf.name) |
||||
unlink(vf.name) |
||||
|
||||
|
||||
def main(): |
||||
parser = ArgumentParser(description="wrapper for the dotter program") |
||||
parser.add_argument('hseq', help="the horizontal sequence, as a USA") |
||||
parser.add_argument('vseq', help="the vertical sequence, as a USA") |
||||
|
||||
args = parser.parse_args() |
||||
|
||||
try: |
||||
hseq = read_usa(args.hseq)[0] |
||||
vseq = read_usa(args.vseq)[0] |
||||
except Exception as e: |
||||
parser.error("cannot read source sequences: {}".format(e)) |
||||
|
||||
try: |
||||
dotter(hseq, vseq) |
||||
except Exception as e: |
||||
parser.error("cannot run dotter: {}".format(e)) |
||||
|
||||
|
||||
if __name__ == '__main__': |
||||
main() |
@ -0,0 +1,260 @@
|
||||
# -*- coding: utf-8 -*- |
||||
# Incenp.Bioutils - Incenp.org's utilities for computational biology |
||||
# Copyright © 2020 Damien Goutte-Gattat |
||||
# |
||||
# This program is free software; you can redistribute it and/or modify |
||||
# it under the terms of the GNU General Public License as published by |
||||
# the Free Software Foundation; either version 3 of the License, or |
||||
# (at your option) any later version. |
||||
# |
||||
# This program is distributed in the hope that it will be useful, |
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
# GNU General Public License for more details. |
||||
# |
||||
# You should have received a copy of the GNU General Public License |
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
"""A command-line tool to manipulate biological sequence files.""" |
||||
|
||||
import sys |
||||
|
||||
from Bio.Restriction.Restriction import RestrictionBatch |
||||
import click |
||||
from incenp.bio import __version__ |
||||
from incenp.bio.seq import utils |
||||
from incenp.bio.seq import wrappers |
||||
from incenp.bio.seq.plasmidmap import summarize_vector |
||||
from incenp.bio.seq.usa import read_usa, write_usa |
||||
from reportlab.pdfgen.canvas import Canvas |
||||
|
||||
prog_name = "seqtool" |
||||
prog_notice = f"""\ |
||||
{prog_name} {__version__} |
||||
Copyright © 2020 Damien Goutte-Gattat |
||||
|
||||
This program is released under the GNU General Public License. |
||||
See the COPYING file or <http://www.gnu.org/licenses/gpl.html>. |
||||
""" |
||||
|
||||
|
||||
@click.group(context_settings={'help_option_names': ['-h', '--help']}) |
||||
@click.version_option(version=__version__, message=prog_notice) |
||||
@click.pass_context |
||||
def seqtool(ctx): |
||||
pass |
||||
|
||||
|
||||
@seqtool.command() |
||||
@click.argument('sequences', nargs=-1) |
||||
@click.option('--output', '-o', default='genbank::stdout', metavar="USA", |
||||
help="""Write to the specified USA instead of standard |
||||
output.""") |
||||
@click.option('--name', '-n', metavar="NAME", help="Set the sequence name.") |
||||
@click.option('--accession', '-a', metavar="ACC", help="Set the sequence ID.") |
||||
@click.option('--description', '-d', help="Set the sequence description.") |
||||
@click.option('--circular', '-c', 'topology', flag_value='circular', |
||||
help="Force circular topology.") |
||||
@click.option('--linear', 'topology', flag_value='linear', default=True, |
||||
help="Force linear topology.") |
||||
@click.option('--division', '-D', metavar="DIV", |
||||
help="""The data file division to use if none is already |
||||
assigned to the sequence.""") |
||||
@click.option('--clean', '-C', is_flag=True, |
||||
help="Clean the output sequence.") |
||||
@click.option('--remove-external-features', '-r', is_flag=True, |
||||
help="Remove features referring to an external sequence.") |
||||
@click.pass_context |
||||
def cat(ctx, sequences, output, name, accession, description, topology, |
||||
division, clean, remove_external_features): |
||||
"""Read and write sequences. |
||||
|
||||
This tool reads the specified input SEQUENCES and catenate them into |
||||
a single written to standard output. |
||||
""" |
||||
|
||||
inputs = [] |
||||
for f in sequences: |
||||
try: |
||||
inputs.extend(read_usa(f)) |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot read {f}: {e}") |
||||
|
||||
if len(inputs) == 0: |
||||
ctx.exit() |
||||
|
||||
outrec = inputs[0] |
||||
for record in inputs[1:]: |
||||
outrec += record |
||||
|
||||
if clean: |
||||
utils.clean(outrec, div=division, topology=topology) |
||||
if remove_external_features: |
||||
utils.remove_external_features(outrec) |
||||
if name: |
||||
outrec.name = name |
||||
if accession: |
||||
outrec.id = accession |
||||
if description: |
||||
outrec.description = description |
||||
|
||||
try: |
||||
write_usa([outrec], output) |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot write to {output}: {e}") |
||||
|
||||
|
||||
@seqtool.command() |
||||
@click.argument('source') |
||||
@click.option('--output', '-o', metavar="USA", default='fasta::stdout', |
||||
help="""Write to the specified USA instead of standard |
||||
output.""") |
||||
@click.pass_context |
||||
def siresist(ctx, source, output): |
||||
"""Silently mutate a CDS. |
||||
|
||||
Creates a variant of the SOURCE sequence with silent mutations. |
||||
""" |
||||
|
||||
try: |
||||
source = read_usa(source)[0] |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot read {source}: {e}") |
||||
|
||||
source.seq = utils.silently_mutate(source.seq) |
||||
|
||||
try: |
||||
write_usa([source], output) |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot write to {output}: {e}") |
||||
|
||||
|
||||
@seqtool.command() |
||||
@click.argument('source') |
||||
@click.argument('destination') |
||||
@click.option('--output', '-o', metavar="USA", |
||||
help="""Write to the specified USA instead of standard |
||||
output.""") |
||||
@click.option('--reaction', '-r', type=click.Choice(['BP', 'LR']), |
||||
default='LR', |
||||
help="Specify the type of Gateway reaction.") |
||||
@click.option('--name', '-n', metavar="NAME", help="Set the sequence name.") |
||||
@click.option('--accession', '-a', metavar="ACC", help="Set the sequence ID.") |
||||
@click.option('--description', '-d', help="Set the sequence description.") |
||||
@click.option('--clean', '-c', is_flag=True, help="Clean the output sequence.") |
||||
@click.pass_context |
||||
def gateway(ctx, source, destination, output, reaction, name, accession, |
||||
description, clean): |
||||
"""Perform in-silico Gateway cloning.""" |
||||
|
||||
try: |
||||
source = read_usa(source)[0] |
||||
destination = read_usa(destination)[0] |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot read input sequences: {e}") |
||||
|
||||
clone = utils.gateway(source, destination, reaction, sys.stderr) |
||||
if not clone: |
||||
raise click.ClickException("Gateway cloning not possible") |
||||
|
||||
clone.name = name or source.name |
||||
clone.id = accession or source.id |
||||
clone.description = description or source.description |
||||
clone.annotations['date'] = utils.genbank_date() |
||||
|
||||
if clean: |
||||
utils.clean(clone) |
||||
|
||||
try: |
||||
write_usa([clone], output) |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot write output: {e}") |
||||
|
||||
|
||||
@seqtool.command() |
||||
@click.argument('sequences') |
||||
@click.option('--output', '-o', metavar="FILE", default='plasmm.pdf', |
||||
help="Write to the specified file.", show_default=True) |
||||
@click.option('--enzymes', '-e', help="Specify the enzymes to display.") |
||||
@click.pass_context |
||||
def plasmm(ctx, sequences, output, enzymes): |
||||
"""Draw plasmid maps.""" |
||||
|
||||
if enzymes: |
||||
_enzymes = RestrictionBatch() |
||||
for enz in enzymes.split(','): |
||||
_enzymes.add(enz) |
||||
|
||||
records = [] |
||||
for f in sequences: |
||||
try: |
||||
records.extend(read_usa(f)) |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot read {f}: {e}") |
||||
|
||||
c = Canvas(output) |
||||
for record in records: |
||||
if record.annotations.get('molecule_type', 'DNA') == 'protein': |
||||
continue |
||||
summarize_vector(c, record) |
||||
|
||||
try: |
||||
c.save() |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot write output: {e}") |
||||
|
||||
|
||||
@seqtool.command() |
||||
@click.argument('subject') |
||||
@click.argument('query') |
||||
@click.option('--type', '-t', 'blast_type', default='blastn', |
||||
type=click.Choice(['blastn', 'blastp', 'blastx', 'tblastn', |
||||
'tblastx']), |
||||
help="The type of alignment to perform.") |
||||
@click.option('--database', '-d', is_flag=True, |
||||
help="Treat SUBJECT as a database name.") |
||||
@click.option('--short', '-s', is_flag=True, |
||||
help="Optimize BLAST for short matches.") |
||||
def blast(ctx, subject, query, blast_type, database, short): |
||||
"""Wrapper for the BLAST programs. |
||||
|
||||
SUBJECT and QUERY should be USAs representing the subject and query |
||||
sequences, respectively. |
||||
""" |
||||
|
||||
try: |
||||
if not database: |
||||
subject = read_usa(subject)[0] |
||||
query = read_usa(query)[0] |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot read source sequences: {e}") |
||||
|
||||
try: |
||||
stdout, _ = wrappers.blast(query, subject, blast_type, database, short) |
||||
print(stdout) |
||||
except BrokenPipeError: |
||||
pass |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot run blast: {e}") |
||||
|
||||
|
||||
@seqtool.command() |
||||
@click.argument('hseq') |
||||
@click.argument('vseq') |
||||
@click.pass_context |
||||
def dotter(ctx, hseq, vseq): |
||||
"""Wrapper for the dotter program.""" |
||||
|
||||
try: |
||||
hseq = read_usa(hseq)[0] |
||||
vseq = read_usa(vseq)[0] |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot read source sequences: {e}") |
||||
|
||||
try: |
||||
wrappers.dotter(hseq, vseq) |
||||
except Exception as e: |
||||
raise click.ClickException(f"Cannot run dotter: {e}") |
||||
|
||||
|
||||
if __name__ == '__main__': |
||||
seqtool() |
@ -1,166 +0,0 @@
|
||||
# -*- coding: utf-8 -*- |
||||
# Incenp.Bioutils - Incenp.org's utilities for computational biology |
||||
# Copyright © 2018,2020 Damien Goutte-Gattat |
||||
# |
||||
# This program is free software; you can redistribute it and/or modify |
||||
# it under the terms of the GNU General Public License as published by |
||||
# the Free Software Foundation; either version 3 of the License, or |
||||
# (at your option) any later version. |
||||
# |
||||
# This program is distributed in the hope that it will be useful, |
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
# GNU General Public License for more details. |
||||
# |
||||
# You should have received a copy of the GNU General Public License |
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
"""Utility functions for working on biological sequences |
||||
|
||||
This module is intended to provide various helper functions |
||||
for working on sequence files. |
||||
""" |
||||
|
||||
from argparse import ArgumentParser |
||||
from datetime import datetime |
||||
|
||||
from incenp.bio.seq.usa import read_usa, write_usa |
||||
|
||||
|
||||
def genbank_date(date=None): |
||||
if date is None: |
||||
date = datetime.now() |
||||
return date.strftime('%d-%b-%Y').upper() |
||||
|
||||
|
||||
def remove_external_features(record): |
||||
for feature in [f for f in record.features[:] if f.ref is not None]: |
||||
record.features.remove(feature) |
||||
|
||||
|
||||
def clean(record, div='UNK', topology='linear'): |
||||
# Set the data division if needed |
||||
if (not 'data_file_division' in record.annotations |
||||
or record.annotations['data_file_division'].isspace()): |
||||
record.annotations['data_file_division'] = div |
||||
|
||||
# Fix topology |
||||
if not 'topology' in record.annotations: |
||||
record.annotations['topology'] = topology |
||||
|
||||
# Fix missing molecule type |
||||
if not 'molecule_type' in record.annotations: |
||||
record.annotations['molecule_type'] = guess_molecule_type(record.seq) |
||||
|
||||
# Translate CDS |
||||
for cds in [f for f in record.features if f.type == 'CDS']: |
||||
if 'translation' in cds.qualifiers: |
||||
continue |
||||
if record.annotations['molecule_type'] == 'protein': |
||||
continue |
||||
seq = record.seq[cds.location.start.position:cds.location.end.position] |
||||
if cds.strand == -1: |
||||
seq = seq.reverse_complement() |
||||
cds.qualifiers['translation'] = str(seq.translate()) |
||||
|
||||
# Remove Ugene stuff |
||||
for feature in [f for f in record.features[:] if 'ugene_name' in f.qualifiers]: |
||||
# Restriction site features |
||||
if 'cut' in feature.qualifiers: |
||||
record.features.remove(feature) |
||||
|
||||
# Fragments |
||||
if 'ugene_group' in feature.qualifiers and 'fragments' in feature.qualifiers['ugene_group']: |
||||
record.features.remove(feature) |
||||
|
||||
# Source |
||||
if 'source' in feature.qualifiers['ugene_name']: |
||||
record.features.remove(feature) |
||||
|
||||
feature.qualifiers.pop('ugene_name', None) |
||||
feature.qualifiers.pop('ugene_group', None) |
||||
|
||||
# Remove SerialCloner stuff |
||||
for feature in record.features: |
||||
feature.qualifiers.pop('SerialCloner_Color', None) |
||||
feature.qualifiers.pop('SerialCloner_Show', None) |
||||
feature.qualifiers.pop('SerialCloner_Protect', None) |
||||
feature.qualifiers.pop('SerialCloner_Arrow', None) |
||||
|
||||
|
||||
_alphabets = { |
||||
'DNA': 'ACGTRYSWKMBDHVN', |
||||
'RNA': 'ACGURYSWKMBDHVN', |
||||
'protein': 'ACDEFGHIKLMNPQRSTVWY*' |
||||
} |
||||
|
||||
|
||||
def guess_molecule_type(sequence, alphabets=_alphabets): |
||||
seqset = set(str(sequence).upper()) |
||||
for typename in alphabets.keys(): |
||||
leftover = seqset - set(alphabets[typename]) |
||||
if not leftover: |
||||
return typename |
||||
return 'DNA' |
||||
|
||||
|
||||
def main(): |
||||
parser = ArgumentParser(description="read and write sequences") |
||||
|
||||
igroup = parser.add_argument_group(title="input/output") |
||||
igroup.add_argument('sequences', nargs='+', |
||||
help="the input sequences, as USAs") |
||||
igroup.add_argument('-o', '--output', nargs='?', default='genbank::stdout', |
||||
help="""write to the specified file (as USA) instead |
||||
of standard output""") |
||||
|
||||
sgroup = parser.add_argument_group(title="output sequence properties") |
||||
sgroup.add_argument('-n', '--name', help="set the sequence name") |
||||
sgroup.add_argument('-a', '--accession', help="set the sequence ID") |
||||
sgroup.add_argument('-d', '--description', help="set the sequence description") |
||||
sgroup.add_argument('-c', '--circular', action='store_const', |
||||
dest='topology', const='circular', default='linear', |
||||
help="force circular topology if needed") |
||||
sgroup.add_argument('-D', '--division', nargs='?', |
||||
help="""data file division to use if none is already |
||||
assigned to the sequence""") |
||||
|
||||
ogroup = parser.add_argument_group(title="optional features") |
||||
ogroup.add_argument('-C', '--clean', action='store_true', |
||||
help="clean the output sequence") |
||||
ogroup.add_argument('-r', '--remove-external-features', dest='remove_ext_feat', |
||||
action='store_true', |
||||
help="remove features referring to an external sequence") |
||||
|
||||
args = parser.parse_args() |
||||
|
||||
inputs = [] |
||||
for f in args.sequences: |
||||
try: |
||||
inputs.extend(read_usa(f)) |
||||
except Exception as e: |
||||
parser.error("cannot read {}: {}".format(f, e)) |
||||
|
||||
output = inputs[0] |
||||
for record in inputs[1:]: |
||||
output += record |
||||
|
||||
if args.clean: |
||||
clean(output, div=args.division, topology=args.topology) |
||||
if args.remove_ext_feat: |
||||
remove_external_features(output) |
||||
if args.name: |
||||
output.name = args.name |
||||
if args.accession: |
||||
output.id = args.accession |
||||
if args.description: |
||||
output.description = args.description |
||||
|
||||
try: |
||||
write_usa([output], args.output) |
||||
except Exception as e: |
||||
parser.error("cannot write to {}: {}".format(args.output, e)) |
||||
|
||||
|
||||
if __name__ == '__main__': |
||||
main() |
@ -1,97 +0,0 @@
|
||||
# -*- coding: utf-8 -*- |
||||
# Incenp.Bioutils - Incenp.org's utilities for computational biology |
||||
# Copyright © 2018,2020 Damien Goutte-Gattat |
||||
# |
||||
# This program is free software; you can redistribute it and/or modify |
||||
# it under the terms of the GNU General Public License as published by |
||||
# the Free Software Foundation; either version 3 of the License, or |
||||
# (at your option) any later version. |
||||
# |
||||
# This program is distributed in the hope that it will be useful, |
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
# GNU General Public License for more details. |
||||
# |
||||
# You should have received a copy of the GNU General Public License |
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
"""Functions to work on siRNA and similar stuff.""" |
||||
|
||||
import argparse |
||||
from random import sample |
||||
|
||||
from Bio.Data import CodonTable |
||||
from Bio.Seq import MutableSeq |
||||
from incenp.bio.seq.usa import read_usa, write_usa |
||||
|
||||
_reverse_codon_tables = {} |
||||
|
||||
|
||||
def _build_reverse_codon_table(code): |
||||
rt = {} |
||||
for residue in code.forward_table.values(): |
||||
codons_list = [] |
||||
for k, v in code.forward_table.items(): |
||||
if v == residue: |
||||
codons_list.append(k) |
||||
rt[residue] = codons_list |
||||
return rt |
||||
|
||||
|
||||
def silently_mutate(sequence, code=None): |
||||
"""Mutate a CDS without changing its translation. |
||||
|
||||
Generate a mutated sequence by altering all possible codons |
||||
without introducing any changes in the aminoacids translation. |
||||
|
||||
:param sequence: the sequence to mutate |
||||
:type: :class:`Bio.Seq.Seq` |
||||
:param code: the codon table to use (defaults to the “standard” genetic code) |
||||
:type: :class:`Bio.Data.CodonTable` |
||||
:return: the mutated sequence |
||||
:rtype: :class:`Bio.Seq.Seq` |
||||
""" |
||||
|
||||
if not code: |
||||
code = CodonTable.unambiguous_dna_by_name['Standard'] |
||||
if not code in _reverse_codon_tables: |
||||
_reverse_codon_tables[code] = _build_reverse_codon_table(code) |
||||
rt = _reverse_codon_tables[code] |
||||
|
||||
output = MutableSeq('') |
||||
for i in range(int(len(sequence) / 3)): |
||||
codon = str(sequence[i * 3:(i * 3) + 3]) |
||||
if codon in code.forward_table: |
||||
residue = code.forward_table[codon] |
||||
possible_codons = rt[residue] |
||||
if len(possible_codons) > 1: |
||||
codon = sample([a for a in possible_codons if a != codon], 1)[0] |
||||
output += codon |
||||
|
||||
return output |
||||
|
||||
|
||||
def main(): |
||||
parser = argparse.ArgumentParser(description="silently mutate a CDS") |
||||
parser.add_argument('source', help="the source sequence, as a USA") |
||||
parser.add_argument('--output', '-o', nargs='?', default='fasta::stdout', |
||||
help="""write to the specified file (as USA) instead |
||||
of standard output""") |
||||
|
||||
args = parser.parse_args() |
||||
|
||||
try: |
||||
source = read_usa(args.source)[0] |
||||
except Exception as e: |
||||
parser.error("cannot read {}: {}".format(args.source, e)) |
||||
|
||||
source.seq = silently_mutate(source.seq) |
||||
|
||||
try: |
||||
write_usa([source], args.output) |
||||
except Exception as e: |
||||
parser.error("cannot write to {}: {}".format(args.output, e)) |
||||
|
||||
|
||||
if __name__ == '__main__': |
||||
main() |
Loading…
Reference in new issue