From ab6e617215682d83c83eb4f6b7ac45784e0a41a6 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Thu, 10 Dec 2020 12:38:15 +0000 Subject: [PATCH] Rework the seqvault command. Move the seqvault command to a separate Click-based module. --- incenp/bio/seq/seqvault.py | 238 +++++++++++++++++++++++++++++++++++++ incenp/bio/seq/vault.py | 230 +---------------------------------- setup.py | 6 +- 3 files changed, 243 insertions(+), 231 deletions(-) create mode 100644 incenp/bio/seq/seqvault.py diff --git a/incenp/bio/seq/seqvault.py b/incenp/bio/seq/seqvault.py new file mode 100644 index 0000000..a124dbd --- /dev/null +++ b/incenp/bio/seq/seqvault.py @@ -0,0 +1,238 @@ +# -*- coding: utf-8 -*- +# Incenp.Bioutils - Incenp.org's utilities for computational biology +# Copyright © 2020 Damien Goutte-Gattat +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
+ +"""A tool to access a BioSQL-based sequence database.""" + +import sys +from configparser import ConfigParser +from hashlib import md5 +from subprocess import run +from tempfile import NamedTemporaryFile + +from Bio import SeqIO +from BioSQL.BioSeqDatabase import open_database +import click +from click_shell import shell +from incenp.bio import __version__ +from incenp.bio.seq import vault +from incenp.bio.seq.usa import read_usa, write_usa + +prog_name = "seqvault" +prog_notice = f"""\ +{prog_name} {__version__} +Copyright © 2020 Damien Goutte-Gattat + +This program is released under the GNU General Public License. +See the COPYING file or <http://www.gnu.org/licenses/gpl.html>. +""" + + +def _get_database(ctx, _, value): + try: + return ctx.obj[value] + except KeyError: + raise click.BadParameter(f"No {value!r} database on the server") + + +@shell(context_settings={'help_option_names': ['-h', '--help']}, + prompt=f"{prog_name}> ") +@click.option('--config', '-c', type=click.Path(exists=True), + default='{}/seqvault.rc'.format(click.get_app_dir('seqvault')), + help="Path to the configuration file.") +@click.option('--driver', metavar="DRIVER", + help="Specify the database driver.") +@click.option('--host', '-H', metavar="HOST", + help="Specify the database host.") +@click.option('--user', '-u', metavar="USER", + help="Specify the database user name.") +@click.option('--name', '-n', metavar="NAME", + help="Specify the database name.") +@click.version_option(version=__version__, message=prog_notice) +@click.pass_context +def seqvault(ctx, config, driver, host, user, name): + """Access a BioSQL sequence database.""" + + cfg = ConfigParser() + cfg.add_section('Server') + cfg.set('Server', 'driver', 'psycopg2') + cfg.set('Server', 'host', 'localhost') + cfg.set('Server', 'user', 'seqvault') + cfg.set('Server', 'database', 'seqvault') + cfg.read(config) + + if driver: + cfg.set('Server', 'driver', driver) + if host: + cfg.set('Server', 'host', host) + if user: + cfg.set('Server', 'user', user) + if name:
+ cfg.set('Server', 'database', name) + + server = open_database(**dict(cfg.items('Server'))) + server.__class__ = vault.Server + + ctx.obj = server + + +@seqvault.command() +@click.pass_obj +def listdb(server): + """List databases. + + This command prints information about the databases available on the + server. + """ + + print("NAME PREFIX ENTRIES") + for db in server.values(): + print(f"{db.name:16}{db.get_prefix():8}{len(db)}") + + +@seqvault.command() +@click.argument('database', callback=_get_database) +@click.option('--output', '-o', metavar="USA", default='fasta::stdout', + help="Write to the specified USA instead of standard output.") +def export(database, output): + """Export sequences from a database. + + This command exports all the sequences contained in the specified + DATABASE. + """ + + write_usa(database.get_unique_Seqs(), output) + + +@seqvault.command('list') +@click.argument('database', callback=_get_database) +@click.option('--all', '-a', 'show_all', is_flag=True, + help="Include obsolete sequences.") +def list_records(database, show_all): + """List database contents. + + This command lists all the sequences contained in the specified + DATABASE. + """ + + if show_all: + entries = database.values() + else: + entries = database.get_unique_seqs() + for entry in entries: + print(f"{entry.name:17}{entry.id:15}{entry.description}") + + +@seqvault.command() +@click.argument('accessions', nargs=-1) +@click.option('--output', '-o', metavar="USA", default='fasta::stdout', + help="Write to the specified USA instead of standard output.") +@click.pass_obj +def get(server, accessions, output): + """Extract sequences from a database. + + This command extracts sequences with the specified ACCESSIONS from + any database on the server.
+ """ + + records = [] + for accession in accessions: + records.append(server.get_Seq_by_accession(accession)) + if len(records) > 0: + write_usa(records, output) + + +@seqvault.command() +@click.argument('database', callback=_get_database) +@click.argument('sequences', nargs=-1) +@click.pass_obj +def add(server, database, sequences): + """Add sequences to a database. + + This command imports the specified SEQUENCES (as USAs) into the + specified DATABASE. + """ + + try: + records = [] + for usa in sequences: + records.extend(read_usa(usa)) + except Exception as e: + raise click.ClickException(f"Cannot read sequences: {e}") + + try: + database.load(records) + server.commit() + except Exception as e: + raise click.ClickException(f"Cannot load sequences: {e}") + + # Extract newly inserted records and write them out + extracted = [] + try: + for record in records: + rid = str(record.annotations['gi']) + extracted.append(database.lookup(gi=rid)) + write_usa(extracted, 'genbank::stdout') + except Exception as e: + raise click.ClickException(f"Cannot write sequences: {e}") + + +@seqvault.command() +@click.argument('accession') +@click.option('--editor', '-e', default='/usr/bin/gvim --nofork', + help="The editor command to use.") +@click.option('--read-only', '-r', is_flag=True, + help="View the record only, do not store back any change.") +@click.pass_obj +def edit(server, accession, editor, read_only): + """Edit a record. + + This command extracts the sequence with the specified ACCESSION + number and fires up an external editor to view and edit the + sequence before saving any changes back to the database. 
+ """ + + record = server.get_Seq_by_accession(accession) + + tmpfile = NamedTemporaryFile(mode='w', delete=False) + SeqIO.write(record, tmpfile, 'genbank') + tmpfile.close() + + if not read_only: + h1 = md5(open(tmpfile.name, 'rb').read()).hexdigest() + + command = editor.split() + command.append(tmpfile.name) + + run(command) + + if not read_only: + h2 = md5(open(tmpfile.name, 'rb').read()).hexdigest() + if h1 != h2: + new_record = SeqIO.read(tmpfile.name, 'genbank') + db = server.get_database_by_prefix(new_record.id[:3]) + db.load([new_record]) + server.commit() + + extracted = db.lookup(gi=str(new_record.annotations['gi'])) + write_usa([extracted], 'genbank::stdout') + + +if __name__ == '__main__': + try: + seqvault() + except Exception as e: + print(f"seqvault: Unexpected error: {e}", file=sys.stderr) diff --git a/incenp/bio/seq/vault.py b/incenp/bio/seq/vault.py index 0172926..8a3d403 100644 --- a/incenp/bio/seq/vault.py +++ b/incenp/bio/seq/vault.py @@ -17,20 +17,9 @@ """Access a BioSQL-based sequence vault.""" -from argparse import ArgumentParser -from configparser import ConfigParser -from hashlib import md5 -from os import getenv -from subprocess import run -from tempfile import NamedTemporaryFile - -from Bio import SeqIO from BioSQL.BioSeq import DBSeqRecord -from BioSQL.BioSeqDatabase import open_database, BioSeqDatabase, DBServer +from BioSQL.BioSeqDatabase import BioSeqDatabase, DBServer from BioSQL.Loader import DatabaseLoader -from IPython import embed -from incenp.bio.seq.usa import read_usa, write_usa -from incenp.helpers.subcommands import Command, CommandList class Server(DBServer): @@ -216,220 +205,3 @@ class Database(BioSeqDatabase): db_loader.load_seqrecord(record) return num_records - - -class ListDatabaseCommand(Command): - - def __init__(self): - super(ListDatabaseCommand, self).__init__('listdb', "list databases") - - def prepare_parser(self, subparser): - subparser.add_argument('database', nargs='?', default=None, help="show only the 
specified database") - - def execute(self, args): - if args.database: - db = args.server[args.database] - print("{:16s}{:d} sequences".format(db.name, len(db))) - else: - print("{:16s}{:8s}{:8s}".format('NAME', 'PREFIX', 'ENTRIES')) - for db in args.server.values(): - print("{:16s}{:8s}{:d}".format(db.name, db.get_prefix(), len(db))) - - -class GetRecordCommand(Command): - - def __init__(self): - Command.__init__(self, 'get', "extract records from a database") - - def prepare_parser(self, subparser): - subparser.add_argument('accessions', nargs='+', help="accession number(s)") - - def execute(self, args): - records = [] - for accession in args.accessions: - records.append(args.server.get_Seq_by_accession(accession)) - - for record in records: - print("{} - {}".format(record.name, len(record))) - - -class ListRecordCommand(Command): - - def __init__(self): - Command.__init__(self, 'list', "list database contents") - - def prepare_parser(self, subparser): - subparser.add_argument('database', help="the database to list") - subparser.add_argument('-a', '--all', dest='show_all', action='store_true', - help="include obsolete sequences") - - def execute(self, args): - db = args.server[args.database] - - if args.show_all: - entries = db.values() - else: - entries = db.get_unique_seqs() - for entry in entries: - print("{:17s}{:15s}{}".format(entry.name, entry.id, entry.description)) - - -class AddRecordCommand(Command): - - def __init__(self): - Command.__init__(self, 'add', "add records to a database") - - def prepare_parser(self, subparser): - subparser.add_argument('database', help="the database to add records to") - subparser.add_argument('records', nargs='+', help="the records to add, as USAs") - - def execute(self, args): - db = args.server[args.database] - - try: - records = [] - for usa in args.records: - print("Reading usa {}".format(usa)) - records.extend(read_usa(usa)) - except Exception as e: - raise Exception("cannot read sequences: {}".format(e)) - - try: - 
db.load(records) - args.server.commit() - except Exception as e: - raise Exception("cannot load sequences: {}".format(e)) - - # Extract newly inserted records and write them out - extracted_records = [] - try: - for record in records: - rid = str(record.annotations['gi']) - extracted_records.append(db.lookup(gi=rid)) - write_usa(extracted_records, 'genbank::stdout') - except Exception as e: - raise Exception("cannot write sequences: {}".format(e)) - - -class EditRecordCommand(Command): - - def __init__(self): - Command.__init__(self, 'edit', "edit a record") - - def prepare_parser(self, subparser): - subparser.add_argument('record', help="accession number of the record to edit") - subparser.add_argument('--editor', '-e', default='/usr/bin/gvim --nofork', - help="the editor command to use") - subparser.add_argument('--view-only', '-v', action='store_true', dest='readonly', - help="view the record only, do not store back any change") - - def execute(self, args): - record = args.server.get_Seq_by_accession(args.record) - - tmpfile = NamedTemporaryFile(mode='w', delete=False) - SeqIO.write(record, tmpfile, 'genbank') - tmpfile.close() - - if not args.readonly: - h1 = md5(open(tmpfile.name, 'rb').read()).hexdigest() - - command = args.editor.split() - command.append(tmpfile.name) - - run(command) - - if not args.readonly: - h2 = md5(open(tmpfile.name, 'rb').read()).hexdigest() - if h1 != h2: - new_record = SeqIO.read(tmpfile.name, 'genbank') - db = args.server.get_database_by_prefix(new_record.id[:3]) - db.load([new_record]) - args.server.commit() - - extracted_record = db.lookup(gi=str(new_record.annotations['gi'])) - write_usa([extracted_record], 'genbank::stdout') - - -class ExportCommand(Command): - - def __init__(self): - Command.__init__(self, 'export', "export all sequences from a database") - - def prepare_parser(self, subparser): - subparser.add_argument('database', help="the database to export") - subparser.add_argument('--output', '-o', 
default='fasta::stdout', - help="where to write the exported sequences, as a USA") - - def execute(self, args): - db = args.server[args.database] - write_usa(db.get_unique_Seqs(), args.output) - - -class ShellCommand(Command): - - def __init__(self): - Command.__init__(self, 'shell', "open a IPython shell") - - def execute(self, args): - server = args.server - embed() - - -def main(): - - home_dir = getenv('HOME', default='') - config_dir = getenv('XDG_CONFIG_HOME', default='{}/.config'.format(home_dir)) - config_file = '{}/seqvault/seqvault.rc'.format(config_dir) - - parser = ArgumentParser(description="access a BioSQL sequence vault") - parser.add_argument('--config', '-c', default=config_file, - help="path to the configuration file") - - db_group = parser.add_argument_group("database options") - db_group.add_argument('--driver', default=None, help="database driver") - db_group.add_argument('--host', default=None, help="database host") - db_group.add_argument('--user', default=None, help="database user") - db_group.add_argument('--name', default=None, help="database name") - - CommandList(parser.add_subparsers(dest='command', required=True), - [ListDatabaseCommand(), - GetRecordCommand(), - AddRecordCommand(), - ListRecordCommand(), - EditRecordCommand(), - ShellCommand(), - ExportCommand() - ]) - - args = parser.parse_args() - - config = ConfigParser() - config.add_section('Server') - config.set('Server', 'driver', 'psycopg2') - config.set('Server', 'host', 'localhost') - config.set('Server', 'user', 'seqvault') - config.set('Server', 'database', 'seqvault') - config.read(args.config) - - if args.driver: - config.set('Server', 'driver', args.driver) - if args.host: - config.set('Server', 'host', args.host) - if args.user: - config.set('Server', 'user', args.user) - if args.name: - config.set('Server', 'database', args.name) - - server = open_database(**dict(config.items('Server'))) - server.__class__ = Server - - args.server = server - - try: - args.func(args) - 
except Exception as e: - parser.exit(1, "{}: unknown error: {}".format(parser.prog, e)) - - -if __name__ == '__main__': - main() diff --git a/setup.py b/setup.py index 8e8bae3..dd4e16f 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,8 @@ setup( ], install_requires=[ - 'click' + 'click', + 'click_shell' ], packages=[ @@ -50,7 +51,8 @@ setup( entry_points={ 'console_scripts': [ - 'seqtool = incenp.bio.seq.seqtool:main', + 'seqtool = incenp.bio.seq.seqtool:seqtool', + 'seqvault = incenp.bio.seq.seqvault:seqvault', 'cc3d-runner = incenp.bio.modelling.cc3d:main' ] }