Some SlackBuild scripts for Slackware.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

883 lines
32 KiB

diff --git a/Bio/SeqIO/GckIO.py b/Bio/SeqIO/GckIO.py
new file mode 100644
index 000000000..f3e080354
--- /dev/null
+++ b/Bio/SeqIO/GckIO.py
@@ -0,0 +1,212 @@
+# Copyright 2019 Damien Goutte-Gattat. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Bio.SeqIO support for the "gck" file format.
+
+The GCK binary format is generated by the Gene Construction Kit software
+from Textco BioSoftware, Inc.
+"""
+
+from struct import unpack
+
+from Bio import Alphabet
+from Bio.Seq import Seq
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio.SeqRecord import SeqRecord
+
+
+def _read(handle, length):
+    """Read exactly ``length`` bytes from handle, or raise ValueError."""
+    data = handle.read(length)
+    if len(data) < length:
+        raise ValueError("Cannot read {} bytes from handle".format(length))
+    return data
+
+
+def _read_packet(handle):
+    """Read a length-prefixed packet and return a (data, length) tuple.
+
+    Parts of a GCK file are made of "packets" consisting of 4 bytes giving
+    the packet's size (big-endian), followed by the packet's data.
+
+    There is no type tag. The type of a packet, and thus the type of data
+    it contains, is solely indicated by the position of the packet within
+    the GCK file. Raises ValueError (via _read) on a truncated packet.
+    """
+    length = _read(handle, 4)
+    length = unpack('>I', length)[0]
+    data = _read(handle, length)
+    return (data, length)
+
+
+def _read_pstring(handle):
+    """Read a Pascal string and return it decoded as ASCII text.
+
+    A Pascal string is one byte for length followed by the actual string.
+    """
+    length = _read(handle, 1)
+    length = unpack('>B', length)[0]
+    data = _read(handle, length).decode('ASCII')
+    return data
+
+
+def _read_p4string(handle):
+    """Read a 32-bit Pascal string and return it decoded as ASCII text.
+
+    Similar to a Pascal string but length is encoded on 4 bytes (big-endian).
+    """
+    length = _read(handle, 4)
+    length = unpack('>I', length)[0]
+    data = _read(handle, length).decode('ASCII')
+    return data
+
+
+def GckIterator(handle):
+    """Parse a GCK file and yield a single SeqRecord object.
+
+    Note that a GCK file can only contain one sequence, so this
+    iterator will always yield exactly one record.
+    """
+    # Skip file header
+    # GCK files start with a 24-byte header. Bytes 4 and 8 seem to
+    # always be 12, maybe this could act as a magic cookie. Bytes
+    # 17-20 and 21-24 contain variable values of unknown meaning.
+    _read(handle, 24)
+
+    # Read the actual sequence data
+    packet, length = _read_packet(handle)
+    # The body of the sequence packet starts with a 32-bit integer
+    # representing the length of the sequence.
+    seq_length = unpack('>I', packet[:4])[0]
+    # This length should not be larger than the length of the
+    # sequence packet.
+    if seq_length > length - 4:
+        raise ValueError("Conflicting sequence length values")
+    sequence = packet[4:].decode('ASCII')
+    record = SeqRecord(Seq(sequence, alphabet=Alphabet.generic_dna))
+
+    # Skip unknown packet
+    _read_packet(handle)
+
+    # Read features packet
+    packet, length = _read_packet(handle)
+    (seq_length, num_features) = unpack('>IH', packet[:6])
+    # Check that length in the features packet matches the actual
+    # length of the sequence
+    if seq_length != len(record):
+        raise ValueError("Conflicting sequence length values")
+    # Each feature is stored in a 92-byte structure.
+    if length - 6 != num_features * 92:
+        raise ValueError("Features packet size inconsistent with number of features")
+    for i in range(0, num_features):
+        offset = 6 + i * 92
+        feature_data = packet[offset:offset + 92]
+
+        # There's probably more stuff to unpack in that structure,
+        # but those values are the only ones I understand.
+        (start, end, type, strand, has_name, has_comment, version) = unpack('>II6xH14xB17xII35xB', feature_data)
+
+        if strand == 1:  # Reverse strand
+            strand = -1
+        else:
+            # Other possible values are 0 (no strand specified),
+            # 2 (forward strand), and 3 (both strands). All are
+            # treated as a forward strand.
+            strand = 1
+        location = FeatureLocation(start, end, strand=strand)
+
+        # It looks like any value > 0 indicates a CDS...
+        if type > 0:
+            type = 'CDS'
+        else:
+            type = 'misc_feature'
+
+        # Each feature may have a name and a comment, which are then
+        # stored immediately after the features packet. Names are
+        # stored as Pascal strings (1 length byte followed by the
+        # string itself), comments are stored as "32-bit Pascal strings"
+        # (4 length bytes followed by the string).
+        qualifiers = {}
+        if has_name > 0:
+            name = _read_pstring(handle)
+            qualifiers['label'] = [name]
+        if has_comment > 0:
+            comment = _read_p4string(handle)
+            qualifiers['note'] = [comment]
+
+        # Each feature may exist in several "versions". We keep only
+        # the most recent version (its strings were still consumed above).
+        if version > 0:
+            continue
+
+        feature = SeqFeature(location, type=type, qualifiers=qualifiers)
+        record.features.append(feature)
+
+    # Read restriction sites packet
+    # We are not interested in restriction sites, but we must still read
+    # that packet so that we can skip the names and comments for each
+    # site, which are stored after that packet in a similar way as for
+    # the features above.
+    packet, length = _read_packet(handle)
+    (seq_length, num_sites) = unpack('>IH', packet[:6])
+    # Each site is stored in an 88-byte structure
+    if length - 6 != num_sites * 88:
+        raise ValueError("Sites packet size inconsistent with number of sites")
+    for i in range(0, num_sites):
+        offset = 6 + i * 88
+        site_data = packet[offset:offset + 88]
+
+        (start, end, has_name, has_comment) = unpack('>II24xII48x', site_data)
+
+        # Skip names and comments
+        if has_name:
+            _read_pstring(handle)
+        if has_comment:
+            _read_p4string(handle)
+
+    # Skip unknown packet
+    _read_packet(handle)
+
+    # Next in the file are "version packets".
+    # However they are not properly formatted "packets" as they are not
+    # preceded by an integer giving their size. Instead we have a
+    # short integer indicating how many versions are there, and then
+    # as many 260-byte blocks as we have versions.
+    num_versions = _read(handle, 2)
+    num_versions = unpack('>H', num_versions)[0]
+    versions = _read(handle, num_versions * 260)
+    for i in range(0, num_versions):
+        offset = i * 260
+        version_data = versions[offset:offset + 260]
+
+        # Each version may have a comment, which is then stored
+        # after all the "version packets".
+        has_comment = unpack('>I', version_data[-4:])[0]
+        if has_comment > 0:
+            _read_p4string(handle)
+
+    # Skip unknown fixed-size block
+    # Whatever this block contains, it is not preceded by any length
+    # indicator, so I hope its size is indeed constant in all files...
+    _read(handle, 706)
+
+    # Read the construct's name
+    name = _read_pstring(handle)
+    record.name = record.id = name.split(' ')[0]
+    record.description = name
+
+    # Circularity byte
+    # There may be other flags in that block, but their meaning
+    # is unknown to me.
+    flags = _read(handle, 17)
+    circularity = unpack('>16xB', flags)[0]
+    if circularity > 0:
+        record.annotations['topology'] = 'circular'
+    else:
+        record.annotations['topology'] = 'linear'
+
+    yield record
diff --git a/Bio/SeqIO/SnapGeneIO.py b/Bio/SeqIO/SnapGeneIO.py
new file mode 100644
index 000000000..aeb8bc292
--- /dev/null
+++ b/Bio/SeqIO/SnapGeneIO.py
@@ -0,0 +1,257 @@
+# Copyright 2017-2019 Damien Goutte-Gattat. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Bio.SeqIO support for the SnapGene file format.
+
+The SnapGene binary format is the native format used by the SnapGene program
+from GSL Biotech LLC.
+"""
+
+from datetime import datetime
+from re import sub
+from struct import unpack
+from xml.dom.minidom import parseString
+
+from Bio import Alphabet
+from Bio.Seq import Seq
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio.SeqRecord import SeqRecord
+
+
+class _PacketIterator:
+    """Iterate over the packets of a SnapGene file.
+
+    A SnapGene file is made of packets, each packet being a TLV-like
+    structure comprising:
+
+    - 1 single byte indicating the packet's type;
+    - 1 big-endian long integer (4 bytes) indicating the length of the
+      packet's data;
+    - the actual data. Each iteration yields (type, length, data).
+    """
+
+    def __init__(self, handle):
+        self.handle = handle
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        type = self.handle.read(1)
+        if len(type) < 1:  # No more packet
+            raise StopIteration
+        type = unpack('>B', type)[0]
+
+        length = self.handle.read(4)
+        if len(length) < 4:
+            raise ValueError("Unexpected end of packet")
+        length = unpack('>I', length)[0]
+
+        data = self.handle.read(length)
+        if len(data) < length:
+            raise ValueError("Unexpected end of packet")
+
+        return (type, length, data)
+
+    # Python2 compatibility
+    def next(self):
+        return self.__next__()
+
+
+def _parse_dna_packet(length, data, record):
+    """Parse a DNA sequence packet and store the sequence in the record.
+
+    A DNA sequence packet contains a single byte flag followed by the
+    sequence itself. Bit 0 of the flag byte marks a circular topology.
+    """
+    if record.seq:
+        raise ValueError("The file contains more than one DNA packet")
+
+    flags, sequence = unpack(">B%ds" % (length - 1), data)
+    record.seq = Seq(sequence.decode('ASCII'), alphabet=Alphabet.generic_dna)
+    if flags & 0x01:
+        record.annotations['topology'] = 'circular'
+    else:
+        record.annotations['topology'] = 'linear'
+
+
+def _parse_notes_packet(length, data, record):
+    """Parse a 'Notes' packet.
+
+    This type of packet contains some metadata about the sequence. They
+    are stored as an XML string (decoded as UTF-8) with a 'Notes' root node.
+    """
+    xml = parseString(data.decode('UTF-8'))
+    type = _get_child_value(xml, 'Type')
+    if type == 'Synthetic':
+        record.annotations['data_file_division'] = 'SYN'
+    else:
+        record.annotations['data_file_division'] = 'UNC'
+
+    date = _get_child_value(xml, 'LastModified')
+    if date:
+        record.annotations['date'] = datetime.strptime(date, '%Y.%m.%d')
+
+    acc = _get_child_value(xml, 'AccessionNumber')
+    if acc:
+        record.id = acc
+
+    comment = _get_child_value(xml, 'Comments')
+    if comment:
+        record.name = comment.split(' ', 1)[0]
+        record.description = comment
+        if not acc:
+            record.id = record.name
+
+
+def _parse_cookie_packet(length, data, record):
+    """Parse a SnapGene cookie packet; raise ValueError on a bad cookie.
+
+    Every SnapGene file starts with a packet of this type. It acts as
+    a magic cookie identifying the file as a SnapGene file.
+    """
+    cookie, seq_type, exp_version, imp_version = unpack('>8sHHH', data)
+    if cookie.decode('ASCII') != 'SnapGene':
+        raise ValueError("The file is not a valid SnapGene file")
+
+
+def _parse_features_packet(length, data, record):
+    """Parse a sequence features packet and append features to the record.
+
+    This packet stores sequence features (except primer binding sites,
+    which are in a dedicated Primers packet). The data is an XML string
+    (decoded as UTF-8) starting with a 'Features' root node.
+    """
+    xml = parseString(data.decode('UTF-8'))
+    for feature in xml.getElementsByTagName('Feature'):
+        quals = {}
+
+        type = _get_attribute_value(feature, 'type', default='misc_feature')
+        label = _get_attribute_value(feature, 'name')
+        if label:
+            quals['label'] = [label]
+
+        strand = +1
+        directionality = int(_get_attribute_value(feature, 'directionality', default="1"))
+        if directionality == 2:
+            strand = -1
+
+        location = None
+        for segment in feature.getElementsByTagName('Segment'):
+            rng = _get_attribute_value(segment, 'range', error="Missing feature location")
+            start, end = [int(x) for x in rng.split('-')]
+            # Account for SnapGene's 1-based coordinates
+            start = start - 1
+            if not location:
+                location = FeatureLocation(start, end, strand=strand)
+            else:
+                location = location + FeatureLocation(start, end, strand=strand)
+        if not location:
+            raise ValueError("Missing feature location")
+
+        for qualifier in feature.getElementsByTagName('Q'):
+            qname = _get_attribute_value(qualifier, 'name',
+                                         error="Missing qualifier name")
+            qvalues = []
+            for value in qualifier.getElementsByTagName('V'):
+                if value.hasAttribute('text'):
+                    qvalues.append(_decode(value.attributes['text'].value))
+                elif value.hasAttribute('predef'):
+                    qvalues.append(_decode(value.attributes['predef'].value))
+                elif value.hasAttribute('int'):
+                    qvalues.append(int(value.attributes['int'].value))
+            quals[qname] = qvalues
+
+        feature = SeqFeature(location, type=type, qualifiers=quals)
+        record.features.append(feature)
+
+
+def _parse_primers_packet(length, data, record):
+    """Parse a Primers packet and append primer_bind features to the record.
+
+    A Primers packet is similar to a Features packet but specifically
+    stores primer binding features. The data is an XML string (decoded
+    as UTF-8) starting with a 'Primers' root node.
+    """
+    xml = parseString(data.decode('UTF-8'))
+    for primer in xml.getElementsByTagName('Primer'):
+        quals = {}
+
+        name = _get_attribute_value(primer, 'name')
+        if name:
+            quals['label'] = [name]
+
+        for site in primer.getElementsByTagName('BindingSite'):
+            rng = _get_attribute_value(site, 'location', error="Missing binding site location")
+            start, end = [int(x) for x in rng.split('-')]
+
+            strand = int(_get_attribute_value(site, 'boundStrand', default="0"))
+            if strand == 1:
+                strand = -1
+            else:
+                strand = +1
+
+            feature = SeqFeature(FeatureLocation(start, end, strand=strand), type='primer_bind', qualifiers=quals)
+            record.features.append(feature)
+
+
+_packet_handlers = {
+    0x00: _parse_dna_packet,
+    0x05: _parse_primers_packet,
+    0x06: _parse_notes_packet,
+    0x09: _parse_cookie_packet,
+    0x0A: _parse_features_packet
+    }
+
+
+# Helper functions to process the XML data in
+# some of the segments
+
+def _decode(text):
+    """Return text with any HTML tags (found in some values) removed."""
+    return sub('<[^>]+>', '', text)
+
+
+def _get_attribute_value(node, name, default=None, error=None):
+    """Return node's attribute ``name``; if absent, raise or return default."""
+    if node.hasAttribute(name):
+        return _decode(node.attributes[name].value)
+    if error:
+        raise ValueError(error)
+    return default
+
+
+def _get_child_value(node, name, default=None, error=None):
+    """Return text of the first ``name`` descendant; else raise or default."""
+    children = node.getElementsByTagName(name)
+    if children and children[0].childNodes and children[0].firstChild.nodeType == node.TEXT_NODE:
+        return _decode(children[0].firstChild.data)
+    if error:
+        raise ValueError(error)
+    return default
+
+
+def SnapGeneIterator(handle):
+    """Parse a SnapGene file and yield a single SeqRecord object.
+
+    Note that a SnapGene file can only contain one sequence, so this
+    iterator will always yield exactly one record.
+    """
+    record = SeqRecord(None)
+    n = 0
+
+    for n, (type, length, data) in enumerate(_PacketIterator(handle)):
+        if n == 0 and type != 0x09:
+            raise ValueError("The file does not start with a SnapGene cookie packet")
+
+        if type in _packet_handlers:
+            _packet_handlers[type](length, data, record)
+
+    if not record.seq:
+        raise ValueError("No DNA packet in file")
+
+    yield record
diff --git a/Bio/SeqIO/XdnaIO.py b/Bio/SeqIO/XdnaIO.py
new file mode 100644
index 000000000..2528468d3
--- /dev/null
+++ b/Bio/SeqIO/XdnaIO.py
@@ -0,0 +1,312 @@
+# Copyright 2017-2019 Damien Goutte-Gattat. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Bio.SeqIO support for the "xdna" file format.
+
+The Xdna binary format is generated by Christian Marck's DNA Strider program
+and also used by Serial Cloner.
+"""
+
+from re import match
+from struct import pack, unpack
+import warnings
+
+from Bio import Alphabet, BiopythonWarning
+from Bio.Seq import Seq
+from Bio.SeqIO.Interfaces import SequenceWriter
+from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
+from Bio.SeqRecord import SeqRecord
+
+
+_seq_types = {
+    0: Alphabet.generic_alphabet,
+    1: Alphabet.generic_dna,
+    2: Alphabet.generic_dna,
+    3: Alphabet.generic_rna,
+    4: Alphabet.generic_protein
+}
+
+_seq_topologies = {
+    0: 'linear',
+    1: 'circular'
+}
+
+
+def _read(handle, length):
+    """Read exactly ``length`` bytes from handle, or raise ValueError."""
+    data = handle.read(length)
+    if len(data) < length:
+        raise ValueError("Cannot read %d bytes from handle" % length)
+    return data
+
+
+def _read_pstring(handle):
+    """Read a Pascal string and return it decoded as ASCII text.
+
+    A Pascal string comprises a single byte giving the length of the string
+    followed by as many bytes.
+    """
+    length = unpack('>B', _read(handle, 1))[0]
+    return unpack('%ds' % length, _read(handle, length))[0].decode('ASCII')
+
+
+def _read_pstring_as_integer(handle):
+    return int(_read_pstring(handle))  # P-string holding a decimal number
+
+
+def _read_overhang(handle):
+    """Read an overhang specification.
+
+    An overhang is represented in a XDNA file as:
+      - a Pascal string containing the text representation of the overhang
+        length, which also indicates the nature of the overhang:
+        - a length of zero means no overhang,
+        - a negative length means a 3' overhang,
+        - a positive length means a 5' overhang;
+      - the actual overhang sequence.
+
+    Examples:
+      - 0x01 0x30: no overhang ("0", as a P-string)
+      - 0x01 0x32 0x41 0x41: 5' AA overhang (P-string "2", then "AA")
+      - 0x02 0x2D 0x31 0x43: 3' C overhang (P-string "-1", then "C")
+
+    Returns a tuple (length, sequence), where sequence is raw bytes,
+    or (None, None) when there is no overhang.
+    """
+    length = _read_pstring_as_integer(handle)
+    if length != 0:
+        overhang = _read(handle, abs(length))
+        return (length, overhang)
+    else:
+        return (None, None)
+
+
+def _parse_feature_description(desc, qualifiers):
+    """Parse the description field of a Xdna feature into ``qualifiers``.
+
+    The 'description' field of a feature sometimes contains several
+    GenBank-like qualifiers, separated by carriage returns (CR, 0x0D).
+    """
+    # Split the field's value in CR-separated lines, skipping empty lines
+    for line in [x for x in desc.split('\x0D') if len(x) > 0]:
+        # Is it a qualifier="value" line?
+        m = match('^([^=]+)="([^"]+)"?$', line)
+        if m:
+            # Store the qualifier as provided
+            qual, value = m.groups()
+            qualifiers[qual] = [value]
+        elif '"' not in line:  # Reject ill-formed qualifiers
+            # Store the entire line as a generic note qualifier
+            qualifiers['note'] = [line]
+
+
+def _read_feature(handle, record):
+    """Read a single sequence feature and append it to record.features."""
+    name = _read_pstring(handle)
+    desc = _read_pstring(handle)
+    type = _read_pstring(handle) or 'misc_feature'
+    start = _read_pstring_as_integer(handle)
+    end = _read_pstring_as_integer(handle)
+
+    # Feature flags (4 bytes):
+    # byte 1 is the strand (0: reverse strand, 1: forward strand);
+    # byte 2 tells whether to display the feature;
+    # byte 4 tells whether to draw an arrow when displaying the feature;
+    # meaning of byte 3 is unknown.
+    (forward, display, arrow) = unpack('>BBxB', _read(handle, 4))
+    if forward:
+        strand = 1
+    else:
+        strand = -1
+        start, end = end, start
+
+    # The last field is a Pascal string usually containing a
+    # comma-separated triplet of numbers ranging from 0 to 255.
+    # I suspect this represents the RGB color to use when displaying
+    # the feature. Skip it as we have no need for it.
+    _read_pstring(handle)
+
+    # Assemble the feature
+    # Shift start by -1 as XDNA feature coordinates are 1-based
+    # while Biopython uses 0-based counting.
+    location = FeatureLocation(start - 1, end, strand=strand)
+    qualifiers = {}
+    if name:
+        qualifiers['label'] = [name]
+    _parse_feature_description(desc, qualifiers)
+    feature = SeqFeature(location, type=type, qualifiers=qualifiers)
+    record.features.append(feature)
+
+
+def XdnaIterator(handle):
+    """Parse a Xdna file and yield a single SeqRecord object.
+
+    Note that this is an "iterator" in name only since a Xdna file always
+    contains a single sequence.
+    """
+    # Parse fixed-size header and do some rudimentary checks
+    #
+    # The "neg_length" value is the length of the part of the sequence
+    # before the nucleotide considered as the "origin" (nucleotide number 1,
+    # which in DNA Strider is not always the first nucleotide).
+    # Biopython's SeqRecord has no such concept of a sequence origin as far
+    # as I know, so we ignore that value. SerialCloner has no such concept
+    # either and always generates files with a neg_length of zero.
+    header = _read(handle, 112)
+    (version, type, topology, length, neg_length, com_length) = unpack('>BBB25xII60xI12x', header)
+    if version != 0:
+        raise ValueError("Unsupported XDNA version")
+    if type not in _seq_types:
+        raise ValueError("Unknown sequence type")
+
+    # Read actual sequence and comment found in all XDNA files
+    sequence = _read(handle, length).decode('ASCII')
+    comment = _read(handle, com_length).decode('ASCII')
+
+    # Try to derive a name from the first "word" of the comment
+    name = comment.split(' ')[0]
+
+    # Create record object
+    record = SeqRecord(Seq(sequence, _seq_types[type]),
+                       description=comment, name=name, id=name)
+    if topology in _seq_topologies:
+        record.annotations['topology'] = _seq_topologies[topology]
+
+    if len(handle.read(1)) == 1:
+        # This is an XDNA file with an optional annotation section.
+
+        # Skip the overhangs as I don't know how to represent
+        # them in the SeqRecord model.
+        _read_overhang(handle)  # right-side overhang
+        _read_overhang(handle)  # left-side overhang
+
+        # Read the features
+        num_features = unpack('>B', _read(handle, 1))[0]
+        while num_features > 0:
+            _read_feature(handle, record)
+            num_features -= 1
+
+    yield record
+
+
+class XdnaWriter(SequenceWriter):
+    """Write files in the Xdna format."""
+
+    def write_file(self, records):
+        """Write the specified record to a Xdna file and return 1.
+
+        Note that the function expects an iterable of records as per the
+        SequenceWriter interface, but it must contain exactly one record
+        as the Xdna format is a mono-record format (ValueError otherwise).
+        """
+        records = list(records)  # accept any iterable, e.g. a generator
+        if not records:
+            raise ValueError("Must have one sequence")
+        if len(records) > 1:
+            raise ValueError("More than one sequence found")
+        record = records[0]
+        self._has_truncated_strings = False
+
+        alptype = Alphabet._get_base_alphabet(record.seq.alphabet)
+        if isinstance(alptype, Alphabet.DNAAlphabet):
+            seqtype = 1
+        elif isinstance(alptype, Alphabet.RNAAlphabet):
+            seqtype = 3
+        elif isinstance(alptype, Alphabet.ProteinAlphabet):
+            seqtype = 4
+        else:
+            seqtype = 0
+
+        if record.annotations.get('topology', 'linear') == 'circular':
+            topology = 1
+        else:
+            topology = 0
+
+        # We store the record's id and description in the comment field.
+        # Make sure to avoid duplicating the id if it is already
+        # contained in the description.
+        if record.description.startswith(record.id):
+            comment = record.description
+        else:
+            comment = '{} {}'.format(record.id, record.description)
+
+        # Write header
+        self.handle.write(pack('>BBB25xII60xI11xB',
+                               0,  # version
+                               seqtype, topology, len(record),
+                               0,  # negative length
+                               len(comment),
+                               255  # end of header
+                               ))
+
+        # Actual sequence and comment
+        self.handle.write(str(record.seq).encode('ASCII'))
+        self.handle.write(comment.encode('ASCII'))
+
+        self.handle.write(pack('>B', 0))  # Annotation section marker
+        self._write_pstring('0')  # right-side overhang
+        self._write_pstring('0')  # left-side overhang
+
+        # Write features
+        # We must skip features with fuzzy locations as they cannot be
+        # represented in the Xdna format
+        features = [f for f in record.features if isinstance(f.location.start, ExactPosition) and isinstance(f.location.end, ExactPosition)]
+        drop = len(record.features) - len(features)
+        if drop > 0:
+            warnings.warn("Dropping {} features with fuzzy locations".format(drop),
+                          BiopythonWarning)
+
+        # We also cannot store more than 255 features as the number of
+        # features is stored on a single byte...
+        if len(features) > 255:
+            drop = len(features) - 255
+            warnings.warn("Too many features, dropping the last {}".format(drop),
+                          BiopythonWarning)
+            features = features[:255]
+
+        self.handle.write(pack('>B', len(features)))
+        for feature in features:
+            self._write_pstring(feature.qualifiers.get('label', [''])[0])
+
+            description = ''
+            for qname in feature.qualifiers:
+                if qname in ('label', 'translation'):
+                    continue
+
+                for val in feature.qualifiers[qname]:
+                    if len(description) > 0:
+                        description = description + '\x0D'
+                    description = description + '%s="%s"' % (qname, val)
+            self._write_pstring(description)
+
+            self._write_pstring(feature.type)
+
+            start = feature.location.start.position + 1  # 1-based coordinates
+            end = feature.location.end.position
+            strand = 1
+            if feature.location.strand == -1:
+                start, end = end, start
+                strand = 0
+            self._write_pstring(str(start))
+            self._write_pstring(str(end))
+
+            self.handle.write(pack('>BBBB', strand, 1, 0, 1))
+            self._write_pstring('127,127,127')
+
+        if self._has_truncated_strings:
+            warnings.warn("Some annotations were truncated to 255 characters",
+                          BiopythonWarning)
+
+        return 1
+
+    def _write_pstring(self, s):
+        """Write the given string as a Pascal string (at most 255 chars)."""
+        if len(s) > 255:
+            self._has_truncated_strings = True
+            s = s[:255]
+        self.handle.write(pack('>B', len(s)))
+        self.handle.write(s.encode('ASCII'))
diff --git a/Bio/SeqIO/__init__.py b/Bio/SeqIO/__init__.py
index 69e9986da..2d8eb25d9 100644
--- a/Bio/SeqIO/__init__.py
+++ b/Bio/SeqIO/__init__.py
@@ -294,6 +294,7 @@ names are also used in Bio.AlignIO and include the following:
which encodes PHRED quality scores with an ASCII offset of 64
(not 33). Note as of version 1.8 of the CASAVA pipeline Illumina
will produce FASTQ files using the standard Sanger encoding.
+ - gck - Gene Construction Kit's format.
- genbank - The GenBank or GenPept flat file format.
- gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
- ig - The IntelliGenetics file format, apparently the same as the
@@ -315,6 +316,7 @@ names are also used in Bio.AlignIO and include the following:
- seqxml - SeqXML, simple XML format described in Schmitt et al (2011).
- sff - Standard Flowgram Format (SFF), typical output from Roche 454.
- sff-trim - Standard Flowgram Format (SFF) with given trimming applied.
+ - snapgene - SnapGene's native format.
- swiss - Plain text Swiss-Prot aka UniProt format.
- tab - Simple two column tab separated sequence files, where each
line holds a record's identifier and sequence. For example,
@@ -325,6 +327,7 @@ names are also used in Bio.AlignIO and include the following:
in separate FASTA files).
- uniprot-xml - The UniProt XML format (replacement for the SwissProt plain
text format which we call "swiss")
+ - xdna - DNA Strider's and SerialCloner's native format.
Note that while Bio.SeqIO can read all the above file formats, it cannot
write to all of them.
@@ -390,6 +393,7 @@ from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
from . import AbiIO
from . import AceIO
from . import FastaIO
+from . import GckIO
from . import IgIO # IntelliGenetics or MASE format
from . import InsdcIO # EMBL and GenBank
from . import NibIO
@@ -398,10 +402,12 @@ from . import PhdIO
from . import PirIO
from . import SeqXmlIO
from . import SffIO
+from . import SnapGeneIO
from . import SwissIO
from . import TabIO
from . import QualityIO # FastQ and qual files
from . import UniprotIO
+from . import XdnaIO
if sys.version_info < (3, 6):
from collections import OrderedDict as _dict
@@ -427,6 +433,7 @@ _FormatToIterator = {"abi": AbiIO.AbiIterator,
"embl": InsdcIO.EmblIterator,
"embl-cds": InsdcIO.EmblCdsFeatureIterator,
"gb": InsdcIO.GenBankIterator,
+ "gck": GckIO.GckIterator,
"genbank": InsdcIO.GenBankIterator,
"genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
"imgt": InsdcIO.ImgtIterator,
@@ -444,11 +451,13 @@ _FormatToIterator = {"abi": AbiIO.AbiIterator,
"qual": QualityIO.QualPhredIterator,
"seqxml": SeqXmlIO.SeqXmlIterator,
"sff": SffIO.SffIterator,
+ "snapgene": SnapGeneIO.SnapGeneIterator,
# Not sure about this in the long run:
"sff-trim": SffIO._SffTrimIterator,
"swiss": SwissIO.SwissIterator,
"tab": TabIO.TabIterator,
"uniprot-xml": UniprotIO.UniprotIterator,
+ "xdna": XdnaIO.XdnaIterator
}
_FormatToString = {
@@ -481,9 +490,11 @@ _FormatToWriter = {"fasta": FastaIO.FastaWriter,
"seqxml": SeqXmlIO.SeqXmlWriter,
"sff": SffIO.SffWriter,
"tab": TabIO.TabWriter,
+ "xdna": XdnaIO.XdnaWriter
}
-_BinaryFormats = ["sff", "sff-trim", "abi", "abi-trim", "seqxml", "nib"]
+_BinaryFormats = ["sff", "sff-trim", "abi", "abi-trim", "gck", "seqxml",
+ "snapgene", "nib", "xdna"]
def write(sequences, handle, format):