Browse Source

Initial commit.

This is binseqs, a Python package intended to provide support
for some binary sequence formats to the BioPython's SeqIO
framework.
snapgene-write
Damien Goutte-Gattat 3 years ago
commit
cfc49d724d
  1. 2
      .gitignore
  2. 1
      AUTHORS
  3. 17
      COPYING
  4. 0
      incenp/__init__.py
  5. 0
      incenp/bio/__init__.py
  6. 221
      incenp/bio/seqio/SnapGeneIO.py
  7. 156
      incenp/bio/seqio/XdnaIO.py
  8. 37
      incenp/bio/seqio/__init__.py
  9. 18
      setup.py

2
.gitignore

@ -0,0 +1,2 @@
*.pyc
build

1
AUTHORS

@ -0,0 +1 @@
Damien Goutte-Gattat

17
COPYING

@ -0,0 +1,17 @@
Redistribution and use of this script, with or without modifications, is
permitted provided that the following conditions are met:
1. Redistributions of this script must retain the above copyright
notice, this list of conditions and the following disclaimer.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

0
incenp/__init__.py

0
incenp/bio/__init__.py

221
incenp/bio/seqio/SnapGeneIO.py

@ -0,0 +1,221 @@
#!/usr/bin/env python
# binseqs - Support for binary sequence formats in Biopython
# Copyright (C) 2017 Damien Goutte-Gattat
#
# Redistribution and use of this script, with or without modifications,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of this script must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from struct import unpack
from xml.dom.minidom import parseString
from datetime import datetime
class _SegmentIterator:
def __init__(self, handle):
self.handle = handle
def __iter__(self):
return self
# Read a single segment from a SnapGene file.
# A SnapGene segment is a TLV-like structure comprising of:
# - 1 single byte indicating the segment's type;
# - 1 big-endian long integer (4bytes) indicating the length
# of the segment's data;
# - the segment's data.
def next(self):
type = self.handle.read(1)
if len(type) < 1: # No more segment
raise StopIteration
type = unpack('>B', type)[0]
length = self.handle.read(4)
if len(length) < 4:
raise ValueError("Unexpected end of segment")
length = unpack('>I', length)[0]
data = self.handle.read(length)
if len(data) < length:
raise ValueError("Unexpected end of segment")
return (type, length, data)
def _parse_dna_segment(length, data, record):
if record.seq:
raise ValueError("The file contains more than one DNA Segment")
flags, sequence = unpack(">B%ds" % (length - 1), data)
record.seq = Seq(sequence, alphabet=Alphabet.generic_dna)
if flags & 0x01:
record.annotations['topology'] = 'circular'
else:
record.annotations['topology'] = 'linear'
_months = [
'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
]
def _parse_notes_segment(length, data, record):
xml = parseString(data)
type = _get_child_value(xml, 'Type')
if type == 'Synthetic':
record.annotations['data_file_division'] = 'SYN'
else:
record.annotations['data_file_division'] = 'UNC'
date = _get_child_value(xml, 'LastModified')
if date:
d = datetime.strptime(date, '%Y.%m.%d')
record.annotations['date'] = "%02d-%s-%4d" % (d.day, _months[d.month - 1], d.year)
acc = _get_child_value(xml, 'AccessionNumber')
if acc:
record.id = acc
def _parse_file_description_segment(length, data, record):
cookie, seq_type, exp_version, imp_version = unpack('>8sHHH', data)
if cookie != 'SnapGene':
raise ValueError("The file is not a valid SnapGene file")
def _parse_features_segment(length, data, record):
xml = parseString(data)
for feature in xml.getElementsByTagName('Feature'):
quals = {}
type = _get_attribute_value(feature, 'type', default='misc_feature')
label = _get_attribute_value(feature, 'name')
if label:
quals['label'] = [label]
strand = +1
directionality = int(_get_attribute_value(feature, 'directionality', default="1"))
if directionality == 2:
strand = -1
location = None
for segment in feature.getElementsByTagName('Segment'):
rng = _get_attribute_value(segment, 'range')
start, end = [int(x) for x in rng.split('-')]
# Account for SnapGene's 1-based coordinates
start = start - 1
if not location:
location = FeatureLocation(start, end, strand=strand)
else:
location = location + FeatureLocation(start, end, strand=strand)
if not location:
raise ValueError("Missing feature location")
for qualifier in feature.getElementsByTagName('Q'):
qname = _get_attribute_value(qualifier, 'name',
error="Missing qualifier name")
qvalues = []
for value in qualifier.getElementsByTagName('V'):
if value.attributes.has_key('text'):
qvalues.append(value.attributes['text'].value)
elif value.attributes.has_key('predef'):
qvalues.append(value.attributes['predef'].value)
elif value.attributes.has_key('int'):
qvalues.append(int(value.attributes['int'].value))
quals[qname] = qvalues
feature = SeqFeature(location, type=type, qualifiers=quals)
record.features.append(feature)
def _parse_primers_segment(length, data, record):
xml = parseString(data)
for primer in xml.getElementsByTagName('primer'):
quals = {}
name = _get_attribute_value(primer, 'name')
if name:
quals['label'] = [name]
for site in primer.getElementsByTagName('BindingSite'):
rng = _get_attribute_value(site, 'location', error="Missing binding ite location")
start, end = [int(x) for x in rng.split('-')]
strand = int(_get_attribute_value(site, 'boundStrand', default="0"))
if strand == 1:
strand = -1
else:
strand = +1
feature = SeqFeature(FeatureLocation(start, end, strand=strand), type='primer_bind', qualifiers=quals)
record.features.append(feature)
_segment_handlers = {
0x00: _parse_dna_segment,
0x05: _parse_primers_segment,
0x06: _parse_notes_segment,
0x09: _parse_file_description_segment,
0x0A: _parse_features_segment
}
# Helper functions to process the XML data in
# some of the segments
def _get_attribute_value(node, name, default=None, error=None):
if node.attributes.has_key(name):
return node.attributes[name].value
elif error:
raise ValueError(error)
else:
return default
def _get_child_value(node, name, default=None, error=None):
children = node.getElementsByTagName(name)
if children and children[0].childNodes and children[0].firstChild.nodeType == node.TEXT_NODE:
return children[0].firstChild.data
elif error:
raise ValueError(error)
else:
return default
def SnapGeneIterator(handle):
record = SeqRecord(None)
n = 0
for type, length, data in _SegmentIterator(handle):
if n == 0 and type != 0x09:
raise ValueError("The file does not start with a File Description Segment")
if _segment_handlers.has_key(type):
_segment_handlers[type](length, data, record)
n = n + 1
if not record.seq:
raise ValueError("No DNA Segment in file")
yield record

156
incenp/bio/seqio/XdnaIO.py

@ -0,0 +1,156 @@
#!/usr/bin/env python
# binseqs - Support for binary sequence formats in Biopython
# Copyright (C) 2017 Damien Goutte-Gattat
#
# Redistribution and use of this script, with or without modifications,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of this script must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from struct import unpack
from re import match
_seq_types = {
0: Alphabet.generic_alphabet,
1: Alphabet.generic_dna,
2: Alphabet.generic_dna,
3: Alphabet.generic_rna,
4: Alphabet.generic_protein
}
_seq_topologies = {
0: 'linear',
1: 'circular'
}
def _read(handle, length):
data = handle.read(length)
if len(data) < length:
raise ValueError("Cannot read %d bytes from handle" % length)
return data
def _read_pstring(handle):
length = unpack('>B', _read(handle, 1))[0]
return unpack('%ds' % length, _read(handle, length))[0]
def _read_pstring_as_integer(handle):
return int(_read_pstring(handle))
def _read_overhang(handle):
# An overhang is represented in a XDNA file as:
# - a Pascal string containing the text representation of the
# overhang length (3' overhang if negative, 5' overhang if
# positive, no overhang if zero);
# - the actual overhang sequence.
length = _read_pstring_as_integer(handle)
if length != 0:
overhang = _read(handle, abs(length))
return (length, overhang)
else:
return (None, None)
def _parse_feature_description(desc, qualifiers):
# The 'description' field of a feature sometimes contains
# several GenBank-like qualifiers, separated by CR characters.
for part in [x for x in desc.split('\x0D') if len(x) > 0]:
m = match('^([^=]+)="([^"]+)"?$', part)
if m:
qual, value = m.groups()
qualifiers[qual] = [value]
elif not '"' in part: # Reject ill-formed qualifiers
qualifiers['note'] = [part]
def _read_feature(handle, record):
name = _read_pstring(handle)
desc = _read_pstring(handle)
type = _read_pstring(handle) or 'misc_feature'
start = _read_pstring_as_integer(handle)
end = _read_pstring_as_integer(handle)
# Feature flags (4 bytes):
# byte 1 is the strand (0: reverse strand, 1: forward strand);
# byte 2 tells whether to display the feature;
# byte 4 tells whether to draw an arrow when displaying the feature;
# meaning of byte 3 is unknown.
(forward, display, arrow) = unpack('>BBxB', _read(handle, 4))
if forward:
strand = 1
else:
strand = -1
start, end = end, start
# The last field is a Pascal string usually containing a
# comma-separated triplet of numbers ranging from 0 to 255.
# I suspect this represents the RGB color to use when displaying
# the feature. Skip it as we have no need for it.
_read_pstring(handle)
# Assemble the feature
# Shift start by -1 as XDNA feature coordinates are 1-based
# while Biopython uses 0-based couting.
location = FeatureLocation(start - 1, end, strand=strand)
qualifiers = {}
if name:
qualifiers['label'] = [name]
_parse_feature_description(desc, qualifiers)
feature = SeqFeature(location, type=type, qualifiers=qualifiers)
record.features.append(feature)
def XdnaIterator(handle):
# Parse fixed-size header and do some rudimentary checks
header = _read(handle, 112)
(version, type, topology, length, neg_length, com_length) = \
unpack('>BBB25xII60xI12x', header)
if version != 0:
raise ValueError("Unsupported XDNA version")
if not _seq_types.has_key(type):
raise ValueError("Unknown sequence type")
# Read actual sequence and comment found in all XDNA files
sequence = _read(handle, length)
comment = _read(handle, com_length)
# Try to derive a name from the first "word" of the comment.
name = comment.split(' ', 2)[0]
if len(name) > 16:
name = None
# Create record object
record = SeqRecord(Seq(sequence, _seq_types[type]),
description=comment, name=name)
if _seq_topologies.has_key(topology):
record.annotations['topology'] = _seq_topologies[topology]
if len(handle.read(1)) == 1:
# This is an XDNA file with an optional annotation section
# Skip the overhangs as I don't know how to represent
# them in the SeqRecord model...
_read_overhang(handle) # right-side overhang
_read_overhang(handle) # left-side overhang
# Read the features
num_features = unpack('>B', _read(handle, 1))[0]
while num_features > 0:
_read_feature(handle, record)
num_features -= 1
yield record

37
incenp/bio/seqio/__init__.py

@ -0,0 +1,37 @@
#!/usr/bin/env python
# binseqs - Support for binary sequence formats in Biopython
# Copyright (C) 2017 Damien Goutte-Gattat
#
# Redistribution and use of this script, with or without modifications,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of this script must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from Bio import SeqIO
from . import XdnaIO
from . import SnapGeneIO
if not SeqIO._FormatToIterator.has_key('xdna'):
SeqIO._FormatToIterator['xdna'] = XdnaIO.XdnaIterator
if not 'xdna' in SeqIO._BinaryFormats:
SeqIO._BinaryFormats.append('xdna')
if not SeqIO._FormatToIterator.has_key('snapgene'):
SeqIO._FormatToIterator['snapgene'] = SnapGeneIO.SnapGeneIterator
if not 'snapgene' in SeqIO._BinaryFormats:
SeqIO._BinaryFormats.append('snapgene')

18
setup.py

@ -0,0 +1,18 @@
#!/usr/bin/python
from distutils.core import setup
setup(name='BinSeqs',
version='0.1',
description='Support for binary sequence formats in Biopython',
author='Damien Goutte-Gattat',
author_email='dgouttegattat@incenp.org',
classifiers=['Development Status :: 1 - Planning',
'Environment :: Console',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Topic :: Scientific/Engineering :: Bio-Informatics'],
packages=['incenp',
'incenp.bio',
'incenp.bio.seqio']
)
Loading…
Cancel
Save