Compare commits

...

3 Commits

Author SHA1 Message Date
Damien Goutte-Gattat e41c97fc19 Prepare 0.2.1 release. 2 years ago
Damien Goutte-Gattat 55e1605684 Only load the parsers if necessary. 2 years ago
Damien Goutte-Gattat 17265f4884 Apply changes from the Biopython pull request. 2 years ago
  1. 9
      NEWS
  2. 16
      README.md
  3. 16
      incenp/bio/seqio/GckIO.py
  4. 7
      incenp/bio/seqio/SnapGeneIO.py
  5. 40
      incenp/bio/seqio/XdnaIO.py
  6. 33
      incenp/bio/seqio/__init__.py
  7. 2
      setup.py

9
NEWS

@ -1,3 +1,12 @@
Changes in binseqs-0.2.1 (2019-08-??)
-------------------------------------
* Check for native Biopython support before loading the parsers.
* Always fill the SeqRecord's description field from the free-form comment.
* Set the SeqRecord's name field to the comment's first word.
* Emit warnings when dropping data in XdnaWriter.
Changes in binseqs-0.2.0 (2019-07-29)
-------------------------------------

16
README.md

@ -6,6 +6,22 @@ framework from [Biopython](https://biopython.org/) by adding
support for some binary sequence formats.
Deprecation Warning
-------------------
This code has now been merged into Biopython. Starting from
Biopython release 1.75, all you need to do to support the formats
below is to load the `Bio.SeqIO` module.
Consequently, this project will no longer be maintained. It will
remain available online but will not be updated. All improvements
and bug fixes will occur in the Biopython repository.
You can still use this module until Biopython 1.75 is released and
available on your system. After that, loading the `incenp.bio.seqio`
module will be a no-op and a DeprecationWarning will be emitted.
Formats supported
-----------------

16
incenp/bio/seqio/GckIO.py

@ -107,11 +107,10 @@ def GckIterator(handle):
if strand == 1: # Reverse strand
strand = -1
elif strand == 2: # Forward strand
strand = 1
elif strand == 3: # Both strands
# Treated the same as a forward strand as BioPython does
# not seem to support dual-stranded features.
else:
# Other possible values are 0 (no strand specified),
# 2 (forward strand), and 3 (both strands). All are
# treated as a forward strand.
strand = 1
location = FeatureLocation(start, end, strand=strand)
@ -192,11 +191,8 @@ def GckIterator(handle):
# Read the construct's name
name = _read_pstring(handle)
if len(name) > 16 or ' ' in name:
# Store that as the record's description
record.description = name
else:
record.name = name
record.name = record.id = name.split(' ')[0]
record.description = name
# Circularity byte
# There may be other flags in that block, but their meaning

7
incenp/bio/seqio/SnapGeneIO.py

@ -106,6 +106,13 @@ def _parse_notes_segment(length, data, record):
if acc:
record.id = acc
comment = _get_child_value(xml, 'Comments')
if comment:
record.name = comment.split(' ', 1)[0]
record.description = comment
if not acc:
record.id = record.name
def _parse_file_description_segment(length, data, record):
cookie, seq_type, exp_version, imp_version = unpack('>8sHHH', data)

40
incenp/bio/seqio/XdnaIO.py

@ -144,13 +144,11 @@ def XdnaIterator(handle):
comment = _read(handle, com_length).decode('ASCII')
# Try to derive a name from the first "word" of the comment.
name = comment.split(' ', 2)[0]
if len(name) > 16:
name = None
name = comment.split(' ')[0]
# Create record object
record = SeqRecord(Seq(sequence, _seq_types[type]),
description=comment, name=name)
description=comment, name=name, id=name)
if topology in _seq_topologies:
record.annotations['topology'] = _seq_topologies[topology]
@ -196,31 +194,41 @@ class XdnaWriter(SequenceWriter):
else:
topology = 0
# We store the record's id and description in the comment field.
# Make sure to avoid duplicating the id if it is already
# contained in the description.
if record.description.startswith(record.id):
comment = record.description
else:
comment = '{} {}'.format(record.id, record.description)
# Write header
self.handle.write(pack('>BBB25xII60xI11xB',
0, # version
seqtype, topology, len(record),
0, # negative length
len(record.description),
len(comment),
255 # end of header
))
# Actual sequence and comment
self.handle.write(str(record.seq))
self.handle.write(record.description)
self.handle.write(str(record.seq).encode('ASCII'))
self.handle.write(comment.encode('ASCII'))
self.handle.write(pack('>B', 0)) # Annotation section marker
self._write_pstring('0') # right-side overhang
self._write_pstring('0') # left-side overhang
# Write features
self.handle.write(pack('>B', len(record.features)))
for feature in record.features:
if type(feature.location.start) != ExactPosition or type(feature.location.end) != ExactPosition:
# Cannot store fuzzy locations, skip feature
continue
# We must skip features with fuzzy locations as they cannot be
# represented in the Xdna format
features = [f for f in record.features if type(f.location.start) == ExactPosition and type(f.location.end) == ExactPosition]
# We also cannot store more than 255 features as the number of
# features is stored on a single byte...
if len(features) > 255:
features = features[:255]
self.handle.write(pack('>B', len(features)))
for feature in features:
self._write_pstring(feature.qualifiers.get('label', [''])[0])
description = ''
@ -252,5 +260,7 @@ class XdnaWriter(SequenceWriter):
def _write_pstring(self, s):
if len(s) > 255:
s = s[:255]
self.handle.write(pack('>B', len(s)))
self.handle.write(s)
self.handle.write(s.encode('ASCII'))

33
incenp/bio/seqio/__init__.py

@ -27,28 +27,25 @@ formats in Biopython's SeqIO.
from Bio import SeqIO
from . import SnapGeneIO
from . import XdnaIO
from . import GckIO
if not 'xdna' in SeqIO._FormatToIterator:
SeqIO._FormatToIterator['xdna'] = XdnaIO.XdnaIterator
if not 'gck' in SeqIO._FormatToIterator:
if not 'xdna' in SeqIO._FormatToWriter:
SeqIO._FormatToWriter['xdna'] = XdnaIO.XdnaWriter
from . import GckIO
from . import SnapGeneIO
from . import XdnaIO
if not 'xdna' in SeqIO._BinaryFormats:
SeqIO._BinaryFormats.append('xdna')
SeqIO._FormatToIterator['gck'] = GckIO.GckIterator
SeqIO._BinaryFormats.append('gck')
if not 'snapgene' in SeqIO._FormatToIterator:
SeqIO._FormatToIterator['snapgene'] = SnapGeneIO.SnapGeneIterator
if not 'snapgene' in SeqIO._BinaryFormats:
SeqIO._BinaryFormats.append('snapgene')
if not 'gck' in SeqIO._FormatToIterator:
SeqIO._FormatToIterator['gck'] = GckIO.GckIterator
SeqIO._FormatToIterator['xdna'] = XdnaIO.XdnaIterator
SeqIO._FormatToWriter['xdna'] = XdnaIO.XdnaWriter
SeqIO._BinaryFormats.append('xdna')
if not 'gck' in SeqIO._BinaryFormats:
SeqIO._BinaryFormats.append('gck')
else:
import warnings
warnings.warn("Your Biopython installation already has support for the "
"Gck, SnapGene, and Xdna formats. You no longer need to "
"load the incenp.bio.seqio module",
DeprecationWarning)

2
setup.py

@ -5,7 +5,7 @@ with open('README.md', 'r') as fh:
setup(
name = 'incenp.binseqs',
version = '0.2.0',
version = '0.2.1',
description = 'Support for binary sequence formats in Biopython',
long_description = long_description,
long_description_content_type = 'text/markdown',

Loading…
Cancel
Save