|
|
|
@ -37,75 +37,75 @@ from incenp.bio import Error |
|
|
|
|
|
|
|
|
|
class DatabaseProvider(object): |
|
|
|
|
"""Provides database adapters for biological databases. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This object provides a unique way to access a set of user-configured |
|
|
|
|
sequence databases. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
The databases this object gives access to should be described in a |
|
|
|
|
INI-style configuration file, located in |
|
|
|
|
INI-style configuration file, located in |
|
|
|
|
`$XDG_CONFIG_HOME/bioutils/databases.ini`. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Each section in the configuration file describes a database. Within a |
|
|
|
|
section, the mandatory ``type`` parameter indicates the type of |
|
|
|
|
database and therefore the type of database adapter to use to query |
|
|
|
|
that database. Other parameters are dependent on the database type. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Supported database types: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
BioSQL database (``type: biosql``) |
|
|
|
|
Any database following the BioSQL schema. Parameters for this type |
|
|
|
|
of database are: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
* ``driver`` indicating the SQL driver to use; |
|
|
|
|
* ``host`` for the hostname of the SQL server; |
|
|
|
|
* ``user`` for the user account to connect to the server with; |
|
|
|
|
* ``password`` for the associated password; |
|
|
|
|
* ``database`` for the SQL database name; |
|
|
|
|
* ``subdb`` for the name of the BioSQL subdatabase, if any. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ExPASy database (``type: expasy``) |
|
|
|
|
The ExPASy server. This type expects no parameter. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Entrez database (``type: entrez``) |
|
|
|
|
One of the NCBI Entrez databases. Parameters for this type of |
|
|
|
|
database are: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
* ``email`` for the email address to send to the NCBI server along |
|
|
|
|
with each query; |
|
|
|
|
* ``database`` for the name of the Entrez database (can be |
|
|
|
|
``nuccore`` or ``protein``, for the DNA/RNA or protein database |
|
|
|
|
respectively). |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Here is an example configuration file:: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[db1] |
|
|
|
|
type: biosql |
|
|
|
|
host: localhost |
|
|
|
|
user: bioutils |
|
|
|
|
database: mydb |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[uniprot] |
|
|
|
|
type: expasy |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[genbank] |
|
|
|
|
type: entrez |
|
|
|
|
email: bioutils@example.org |
|
|
|
|
database: nuccore |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
With such a configuration file, the database provider can either be |
|
|
|
|
used directly:: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Create the object and parse the configuration file |
|
|
|
|
dbprovider = DatabaseProvider() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Query the ExPASy server |
|
|
|
|
dbprovider['uniprot'].fetch('NP_001800') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
or through a USA:: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Query the ExPASy server |
|
|
|
|
usa.read_usa('uniprot::NP_001800', databases=dbprovider) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
def __init__(self): |
|
|
|
@ -114,8 +114,11 @@ class DatabaseProvider(object): |
|
|
|
|
self.biosql_servers = {} |
|
|
|
|
|
|
|
|
|
cfg_file = '{}/bioutils/databases.ini'.format( |
|
|
|
|
getenv('XDG_CONFIG_HOME', default='{}/.config'.format( |
|
|
|
|
getenv('HOME', default='.')))) |
|
|
|
|
getenv( |
|
|
|
|
'XDG_CONFIG_HOME', |
|
|
|
|
default='{}/.config'.format(getenv('HOME', default='.')), |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
self.cfg.read(cfg_file) |
|
|
|
|
|
|
|
|
|
def __contains__(self, database): |
|
|
|
@ -153,8 +156,7 @@ class DatabaseProvider(object): |
|
|
|
|
if subdb: |
|
|
|
|
if not subdb in server: |
|
|
|
|
raise Error(f"No subdatabase {subdb} on server") |
|
|
|
|
adapter = BioSqlAdapter(server[subdb].adaptor, |
|
|
|
|
server[subdb].dbid) |
|
|
|
|
adapter = BioSqlAdapter(server[subdb].adaptor, server[subdb].dbid) |
|
|
|
|
else: |
|
|
|
|
adapter = BioSqlAdapter(server.adaptor) |
|
|
|
|
|
|
|
|
@ -166,8 +168,7 @@ class DatabaseProvider(object): |
|
|
|
|
email = self.cfg.get(database, 'email') |
|
|
|
|
dbname = self.cfg.get(database, 'database') |
|
|
|
|
except NoOptionError: |
|
|
|
|
raise Error("Incomplete configuration for database " |
|
|
|
|
f"{database!r}") |
|
|
|
|
raise Error("Incomplete configuration for database " f"{database!r}") |
|
|
|
|
|
|
|
|
|
if dbname not in ['nuccore', 'protein']: |
|
|
|
|
raise Error(f"Invalid database for {database!r}: {dbname}") |
|
|
|
@ -196,8 +197,9 @@ class DatabaseProvider(object): |
|
|
|
|
conn_settings['host'] = self.cfg.get(name, 'host') |
|
|
|
|
conn_settings['user'] = self.cfg.get(name, 'user') |
|
|
|
|
conn_settings['database'] = self.cfg.get(name, 'database') |
|
|
|
|
conn_settings['password'] = self.cfg.get(name, 'password', |
|
|
|
|
fallback=None) |
|
|
|
|
conn_settings['password'] = self.cfg.get( |
|
|
|
|
name, 'password', fallback=None |
|
|
|
|
) |
|
|
|
|
except NoOptionError: |
|
|
|
|
raise Error(f"Incomplete configuration for database {name!r}") |
|
|
|
|
|
|
|
|
@ -213,19 +215,19 @@ class DatabaseProvider(object): |
|
|
|
|
|
|
|
|
|
class DatabaseAdapter(object): |
|
|
|
|
"""Base class for database-specific access providers. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This class defines the common interface shared by all the database |
|
|
|
|
adapters. |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
def query(self, field, pattern): |
|
|
|
|
"""Gets records matching the specified query pattern. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This method queries the underlying database for all records |
|
|
|
|
matching the indicated pattern in the specified field. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
The *field* argument can take the following values: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
* ``acc`` to search for an accession number; |
|
|
|
|
* ``id`` to search for a record name; |
|
|
|
|
* ``sv`` to search for a versioned accession number or a |
|
|
|
@ -233,14 +235,14 @@ class DatabaseAdapter(object): |
|
|
|
|
* ``des`` to search for words in a record's description; |
|
|
|
|
* ``org`` to search for an organism; |
|
|
|
|
* ``key`` to search for words in a record's keywords. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Not all database adapters may support all those types of |
|
|
|
|
queries. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
The *pattern* argument may contain wildcards: ``?`` stands for |
|
|
|
|
any character, and ``*`` stands for any number of characters. |
|
|
|
|
Not all database adapters may support wildcards. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
:param field: the database field to search against |
|
|
|
|
:param pattern: the pattern to search for |
|
|
|
|
:return: the matching records, as a list of |
|
|
|
@ -253,16 +255,16 @@ class DatabaseAdapter(object): |
|
|
|
|
|
|
|
|
|
def fetch(self, identifier): |
|
|
|
|
"""Gets records matching the specified identifier. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This method queries the underlying database for all records |
|
|
|
|
whose name or accession number matches the specified pattern. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
The identifier may contain wildcards: ``?`` stands for any |
|
|
|
|
character, and ``*`` stands for any number of characters. Not |
|
|
|
|
all database adapters may support wildcards. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
:param identifier: the pattern to look for |
|
|
|
|
:return: the matching records, as a list of |
|
|
|
|
:return: the matching records, as a list of |
|
|
|
|
:class:`Bio.SeqRecord.SeqRecord` objects (or objects with a |
|
|
|
|
compatible interface, such as |
|
|
|
|
:class:`BioSQL.BioSeq.DBSeqRecord`) |
|
|
|
@ -272,22 +274,21 @@ class DatabaseAdapter(object): |
|
|
|
|
|
|
|
|
|
def fetchall(self): |
|
|
|
|
"""Gets all records in the database. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This method returns *all* the records contained in the |
|
|
|
|
underlying database. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Not all database adapters may support this method. In |
|
|
|
|
particular, it is expected that adapters for online databases |
|
|
|
|
will most likely not support it. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
:return: the database records, as a list of |
|
|
|
|
:class:`Bio.SeqRecord.SeqRecord` objects (or objects with a |
|
|
|
|
compatible interface, such as |
|
|
|
|
:class:`BioSQL.BioSeq.DBSeqRecord`) |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
raise Error("Fetching all records from this database is not " |
|
|
|
|
"supported") |
|
|
|
|
raise Error("Fetching all records from this database is not " "supported") |
|
|
|
|
|
|
|
|
|
def close(self): |
|
|
|
|
"""Frees resources associated with the database.""" |
|
|
|
@ -312,20 +313,20 @@ def _pattern_to_sql_pattern(pattern): |
|
|
|
|
|
|
|
|
|
class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
"""Adapter for BioSQL-based sequence databases. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This adapter provides access to any biological database following |
|
|
|
|
the BioSQL schema, as supported by Biopython's ``Bio.BioSQL`` |
|
|
|
|
module. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Usage:: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from BioSQL.BioSeqDatabase import open_database |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server = open_database(...) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# For a server-wide adapter |
|
|
|
|
adapter = BioSqlAdapter(server.adaptor) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# For an adapter restricted to a subdatabase |
|
|
|
|
database = server[database_name] |
|
|
|
|
adapter = BioSqlAdapter(database.adaptor, database.dbid) |
|
|
|
@ -334,7 +335,7 @@ class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
|
|
|
|
|
def __init__(self, adaptor, dbid=None): |
|
|
|
|
"""Creates a new BioSQL database adapter. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
:param adaptor: a :class:`BioSQL.BioSeqDatabase.Adaptor` object |
|
|
|
|
connected to the target database |
|
|
|
|
:param dbid: a BioSQL subdatabase identifier; if ``None``, |
|
|
|
@ -350,13 +351,10 @@ class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
sql = ( |
|
|
|
|
'SELECT bioentry_id FROM bioentry ' |
|
|
|
|
'WHERE accession LIKE %s ESCAPE \'\\\'' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
args = [_pattern_to_sql_pattern(keyword)] |
|
|
|
|
elif field == 'id': |
|
|
|
|
sql = ( |
|
|
|
|
'SELECT bioentry_id FROM bioentry ' |
|
|
|
|
'WHERE name LIKE %s ESCAPE \'\\\'' |
|
|
|
|
) |
|
|
|
|
sql = 'SELECT bioentry_id FROM bioentry ' 'WHERE name LIKE %s ESCAPE \'\\\'' |
|
|
|
|
args = [_pattern_to_sql_pattern(keyword)] |
|
|
|
|
elif field == 'sv': |
|
|
|
|
if '.' in keyword: |
|
|
|
@ -368,21 +366,21 @@ class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
sql = ( |
|
|
|
|
'SELECT bioentry_id FROM bioentry ' |
|
|
|
|
'WHERE accession LIKE %s ESCAPE \'\\\'' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
args = [_pattern_to_sql_pattern(accession)] |
|
|
|
|
else: |
|
|
|
|
sql = ( |
|
|
|
|
'SELECT bioentry_id FROM bioentry ' |
|
|
|
|
'WHERE (accession LIKE %s ESCAPE \'\\\' ' |
|
|
|
|
' AND version = %s)' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
args = [_pattern_to_sql_pattern(accession), version] |
|
|
|
|
else: |
|
|
|
|
sql = ( |
|
|
|
|
'SELECT bioentry_id FROM bioentry ' |
|
|
|
|
'WHERE (accession LIKE %s ESCAPE \'\\\' ' |
|
|
|
|
' OR identifier LIKE %s ESCAPE \'\\\')' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
pattern = _pattern_to_sql_pattern(keyword) |
|
|
|
|
args = [pattern, pattern] |
|
|
|
|
else: |
|
|
|
@ -401,7 +399,7 @@ class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
'SELECT bioentry_id FROM bioentry ' |
|
|
|
|
'WHERE (accession LIKE %s ESCAPE \'\\\' ' |
|
|
|
|
' OR name LIKE %s ESCAPE \'\\\')' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
args = [pattern, pattern] |
|
|
|
|
|
|
|
|
|
if self.dbid is not None: |
|
|
|
@ -417,13 +415,13 @@ class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
'SELECT max(bioentry_id) FROM bioentry ' |
|
|
|
|
'WHERE biodatabase_id = %s ' |
|
|
|
|
'GROUP BY accession ORDER BY accession' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
args = [self.dbid] |
|
|
|
|
else: |
|
|
|
|
sql = ( |
|
|
|
|
'SELECT max(bioentry_id) FROM bioentry ' |
|
|
|
|
'GROUP BY accession ORDER BY accession' |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
args = [] |
|
|
|
|
res = self.adaptor.execute_and_fetchall(sql, args) |
|
|
|
|
return [DBSeqRecord(self.adaptor, r[0]) for r in res] |
|
|
|
@ -431,10 +429,10 @@ class BioSqlAdapter(DatabaseAdapter): |
|
|
|
|
|
|
|
|
|
class ExpasyAdapter(DatabaseAdapter): |
|
|
|
|
"""Adapter for the ExPASy sequence server. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This adapter queries the ExPASy server to fetch sequences directly |
|
|
|
|
over the Internet. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Only queries by identifier (without wildcards) are supported. |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
@ -451,16 +449,16 @@ class ExpasyAdapter(DatabaseAdapter): |
|
|
|
|
|
|
|
|
|
class EntrezAdapter(DatabaseAdapter): |
|
|
|
|
"""Adapter for the NCBI E-Utilities. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This adapter queries the NCBI server to fetch sequences directly |
|
|
|
|
over the Internet. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Only queries by identifier (without wildcards) are supported. |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
def __init__(self, email, database): |
|
|
|
|
"""Creates a new Entrez adapter. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
:param email: the email address to pass on to the NCBI server |
|
|
|
|
along with any query |
|
|
|
|
:param database: the name of the Entrez database to query; can |
|
|
|
@ -472,8 +470,9 @@ class EntrezAdapter(DatabaseAdapter): |
|
|
|
|
|
|
|
|
|
def fetch(self, identifier): |
|
|
|
|
try: |
|
|
|
|
handle = Entrez.efetch(db=self.database, id=identifier, |
|
|
|
|
rettype='gb', retmode='text') |
|
|
|
|
handle = Entrez.efetch( |
|
|
|
|
db=self.database, id=identifier, rettype='gb', retmode='text' |
|
|
|
|
) |
|
|
|
|
except HTTPError as e: |
|
|
|
|
raise Error(f"Cannot fetch sequence {identifier}", e) |
|
|
|
|
|
|
|
|
|