5 Commits

Author SHA1 Message Date
Damien Goutte-Gattat e41c97fc19 Prepare 0.2.1 release. 2 years ago
Damien Goutte-Gattat 55e1605684 Only load the parsers if necessary. 2 years ago
Damien Goutte-Gattat 17265f4884 Apply changes from the Biopython pull request. 2 years ago
Damien Goutte-Gattat a4d90f68e5 Prepare next release. 2 years ago
Damien Goutte-Gattat e33d85666a Add support for the Gene Construction Kit format. 2 years ago
  1. 2
      MANIFEST.in
  2. 13
      NEWS
  3. 18
      README.md
  4. 207
      incenp/bio/seqio/GckIO.py
  5. 7
      incenp/bio/seqio/SnapGeneIO.py
  6. 40
      incenp/bio/seqio/XdnaIO.py
  7. 28
      incenp/bio/seqio/__init__.py
  8. 2
      setup.py

2
MANIFEST.in

@ -1 +1 @@
include AUTHORS LICENSE.txt README.md
include AUTHORS LICENSE.txt README.md NEWS

13
NEWS

@ -0,0 +1,13 @@
Changes in binseqs-0.2.1 (2019-08-??)
-------------------------------------
* Check for native Biopython support before loading the parsers.
* Always fill the SeqRecord's description field from free-form comment.
* Set the SeqRecord's name field to the comment's first word.
* Emit warnings when dropping data in XdnaWriter.
Changes in binseqs-0.2.0 (2019-07-29)
-------------------------------------
* Add read support for the GCK file format.

18
README.md

@ -6,6 +6,22 @@ framework from [Biopython](https://biopython.org/) by adding
support for some binary sequence formats.
Deprecation Warning
-------------------
This code has now been merged into Biopython. Starting from
Biopython release 1.75, all you need to do to support the formats
below is to load the `Bio.SeqIO` module.
Consequently, this project will no longer be maintained. It will
remain available online but will not be updated. All improvements
and bug fixes will occur in the Biopython repository.
You can still use this module until Biopython 1.75 is released and
available on your system. After that, loading the `incenp.bio.seqio`
module will be a no-op and a DeprecationWarning will be emitted.
Formats supported
-----------------
@ -14,6 +30,8 @@ Formats supported
reading and writing supported
* `snapgene` format, used by [SnapGene](https://www.snapgene.com/):
reading support only
* `gck` format, used by [Gene Construction Kit](http://www.textco.com/gene-construction-kit.php):
reading support only
Usage

207
incenp/bio/seqio/GckIO.py

@ -0,0 +1,207 @@
# -*- coding: utf-8 -*-
# Copyright © 2019 Damien Goutte-Gattat
#
# Redistribution and use of this script, with or without modifications,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of this script must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Gene Construction kit (GCK) format support for Biopython
Provide read support for the binary format used by Gene Construction Kit.
"""
from struct import unpack
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
def _read(handle, length):
data = handle.read(length)
if len(data) < length:
raise ValueError("Cannot read {} bytes from handle".format(length))
return data
def _read_packet(handle):
length = _read(handle, 4)
length = unpack('>I', length)[0]
data = _read(handle, length)
return (data, length)
def _read_pstring(handle):
"""Read a Pascal string.
A Pascal string is one byte for length followed by the actual string."""
length = _read(handle, 1)
length = unpack('>B', length)[0]
data = _read(handle, length).decode('ASCII')
return data
def _read_p4string(handle):
"""Read a 32-bit Pascal string.
Similar to a Pascal string but length is encoded on 4 bytes."""
length = _read(handle, 4)
length = unpack('>I', length)[0]
data = _read(handle, length).decode('ASCII')
return data
def GckIterator(handle):
# Skip file header
# GCK files start with a 24-bytes header. Bytes 4 and 8 seem to
# always be 12, maybe this could act as a magic cookie. Bytes
# 17-20 and 21-24 contain variable values of unknown meaning.
_read(handle, 24)
# Read the actual sequence data
packet, length = _read_packet(handle)
# The body of the sequence packet starts with a 32-bit integer
# representing the length of the sequence.
seq_length = unpack('>I', packet[:4])[0]
# This length should not be larger than the length of the
# sequence packet.
if seq_length > length - 4:
raise ValueError("Conflicting sequence length values.")
sequence = packet[4:].decode('ASCII')
record = SeqRecord(Seq(sequence, alphabet=Alphabet.generic_dna))
# Skip unknown packet
_read_packet(handle)
# Read features packet
packet, length = _read_packet(handle)
(seq_length, num_features) = unpack('>IH', packet[:6])
# Check that length in the features packet matches the actual
# length of the sequence
if seq_length != len(record):
raise ValueError("Conflicting sequence length values.")
# Each feature is stored in a 92-bytes structure.
if length - 6 != num_features * 92:
raise ValueError("Features packet size inconsistent with number of features.")
for i in range(0, num_features):
offset = 6 + i * 92
feature_data = packet[offset:offset+92]
# There's probably more stuff to unpack in that structure,
# but those values are the only ones I understand.
(start, end, type, strand, has_name, has_comment, version) = \
unpack('>II6xH14xB17xII35xB', feature_data)
if strand == 1: # Reverse strand
strand = -1
else:
# Other possible values are 0 (no strand specified),
# 2 (forward strand), and 3 (both strands). All are
# treated as a forward strand.
strand = 1
location = FeatureLocation(start, end, strand=strand)
# It looks like any value > 0 indicates a CDS...
if type > 0:
type = 'CDS'
else:
type = 'misc_feature'
# Each feature may have a name and a comment, which are then
# stored immediately after the features packet. Names are
# stored as Pascal strings (1 length byte followed by the
# string itself), comments are stored as "32-bit Pascal strings"
# (4 length bytes followed by the string).
qualifiers = {}
if has_name > 0:
name = _read_pstring(handle)
qualifiers['label'] = [name]
if has_comment > 0:
comment = _read_p4string(handle)
qualifiers['note'] = [comment]
# Each feature may exist in several "versions". We keep only
# the most recent version.
if version > 0:
continue
feature = SeqFeature(location, type=type, qualifiers=qualifiers)
record.features.append(feature)
# Read restriction sites packet
# We are not interested in restriction sites, but we must still read
# that packet so that we can skip the names and comments for each
# site, which are stored after that packet in a similar way as for
# the features above.
packet, length = _read_packet(handle)
(seq_length, num_sites) = unpack('>IH', packet[:6])
# Each site is stored in a 88-bytes structure
if length - 6 != num_sites * 88:
raise ValueError("Sites packet size inconsistent with number of sites")
for i in range(0, num_sites):
offset = 6 + i * 88
site_data = packet[offset:offset+88]
(start, end, has_name, has_comment) = unpack('>II24xII48x', site_data)
# Skip names and comments
if has_name:
_read_pstring(handle)
if has_comment:
_read_p4string(handle)
# Skip unknown packet
_read_packet(handle)
# Next in the file are "version packets".
# However they are not properly formatted "packets" as they are not
# preceded by an integer giving their size. Instead we have a
# short integer indicating how many versions are there, and then
# as many 260-bytes block as we have versions.
num_versions = _read(handle, 2)
num_versions = unpack('>H', num_versions)[0]
versions = _read(handle, num_versions * 260)
for i in range(0, num_versions):
offset = i * 260
version_data = versions[offset:offset+260]
# Each version may have a comment, which is then stored
# after all the "version packets".
has_comment = unpack('>I', version_data[-4:])[0]
if has_comment > 0:
_read_p4string(handle)
# Skip unknown fixed-size block
# Whatever this block contains, it is not preceded by any length
# indicator, so I hope its size is indeed constant in all files...
_read(handle, 706)
# Read the construct's name
name = _read_pstring(handle)
record.name = record.id = name.split(' ')[0]
record.description = name
# Circularity byte
# There may be other flags in that block, but their meaning
# is unknown to me.
flags = _read(handle, 17)
circularity = unpack('>16xB', flags)[0]
if circularity > 0:
record.annotations['topology'] = 'circular'
else:
record.annotations['topology'] = 'linear'
yield record

7
incenp/bio/seqio/SnapGeneIO.py

@ -106,6 +106,13 @@ def _parse_notes_segment(length, data, record):
if acc:
record.id = acc
comment = _get_child_value(xml, 'Comments')
if comment:
record.name = comment.split(' ', 1)[0]
record.description = comment
if not acc:
record.id = record.name
def _parse_file_description_segment(length, data, record):
cookie, seq_type, exp_version, imp_version = unpack('>8sHHH', data)

40
incenp/bio/seqio/XdnaIO.py

@ -144,13 +144,11 @@ def XdnaIterator(handle):
comment = _read(handle, com_length).decode('ASCII')
# Try to derive a name from the first "word" of the comment.
name = comment.split(' ', 2)[0]
if len(name) > 16:
name = None
name = comment.split(' ')[0]
# Create record object
record = SeqRecord(Seq(sequence, _seq_types[type]),
description=comment, name=name)
description=comment, name=name, id=name)
if topology in _seq_topologies:
record.annotations['topology'] = _seq_topologies[topology]
@ -196,31 +194,41 @@ class XdnaWriter(SequenceWriter):
else:
topology = 0
# We store the record's id and description in the comment field.
# Make sure to avoid duplicating the id if it is already
# contained in the description.
if record.description.startswith(record.id):
comment = record.description
else:
comment = '{} {}'.format(record.id, record.description)
# Write header
self.handle.write(pack('>BBB25xII60xI11xB',
0, # version
seqtype, topology, len(record),
0, # negative length
len(record.description),
len(comment),
255 # end of header
))
# Actual sequence and comment
self.handle.write(str(record.seq))
self.handle.write(record.description)
self.handle.write(str(record.seq).encode('ASCII'))
self.handle.write(comment.encode('ASCII'))
self.handle.write(pack('>B', 0)) # Annotation section marker
self._write_pstring('0') # right-side overhang
self._write_pstring('0') # left-side overhand
# Write features
self.handle.write(pack('>B', len(record.features)))
for feature in record.features:
if type(feature.location.start) != ExactPosition or type(feature.location.end) != ExactPosition:
# Cannot store fuzzy locations, skip feature
continue
# We must skip features with fuzzy locations as they cannot be
# represented in the Xdna format
features = [f for f in record.features if type(f.location.start) == ExactPosition and type(f.location.end) == ExactPosition]
# We also cannot store more than 255 features as the number of
# features is stored on a single byte...
if len(features) > 255:
features = features[:255]
self.handle.write(pack('>B', len(features)))
for feature in features:
self._write_pstring(feature.qualifiers.get('label', [''])[0])
description = ''
@ -252,5 +260,7 @@ class XdnaWriter(SequenceWriter):
def _write_pstring(self, s):
if len(s) > 255:
s = s[:255]
self.handle.write(pack('>B', len(s)))
self.handle.write(s)
self.handle.write(s.encode('ASCII'))

28
incenp/bio/seqio/__init__.py

@ -27,21 +27,25 @@ formats in Biopython's SeqIO.
from Bio import SeqIO
from . import SnapGeneIO
from . import XdnaIO
if not 'gck' in SeqIO._FormatToIterator:
from . import GckIO
from . import SnapGeneIO
from . import XdnaIO
if not 'xdna' in SeqIO._FormatToIterator:
SeqIO._FormatToIterator['xdna'] = XdnaIO.XdnaIterator
SeqIO._FormatToIterator['gck'] = GckIO.GckIterator
SeqIO._BinaryFormats.append('gck')
if not 'xdna' in SeqIO._FormatToWriter:
SeqIO._FormatToWriter['xdna'] = XdnaIO.XdnaWriter
SeqIO._FormatToIterator['snapgene'] = SnapGeneIO.SnapGeneIterator
SeqIO._BinaryFormats.append('snapgene')
if not 'xdna' in SeqIO._BinaryFormats:
SeqIO._FormatToIterator['xdna'] = XdnaIO.XdnaIterator
SeqIO._FormatToWriter['xdna'] = XdnaIO.XdnaWriter
SeqIO._BinaryFormats.append('xdna')
if not 'snapgene' in SeqIO._FormatToIterator:
SeqIO._FormatToIterator['snapgene'] = SnapGeneIO.SnapGeneIterator
if not 'snapgene' in SeqIO._BinaryFormats:
SeqIO._BinaryFormats.append('snapgene')
else:
import warnings
warnings.warn("Your Biopython installation already has support for the "
"Gck, SnapGene, and Xdna formats. You no longer need to "
"load the incenp.bio.seqio module",
DeprecationWarning)

2
setup.py

@ -5,7 +5,7 @@ with open('README.md', 'r') as fh:
setup(
name = 'incenp.binseqs',
version = '0.1.0',
version = '0.2.1',
description = 'Support for binary sequence formats in Biopython',
long_description = long_description,
long_description_content_type = 'text/markdown',

Loading…
Cancel
Save