Browse Source

l/biopython: Patched to fix SnapGene parser.

current-20200608
Damien Goutte-Gattat 2 months ago
parent
commit
1cc1cc0e99
3 changed files with 112 additions and 1 deletion
  1. +40
    -0
      l/biopython/biopython-1.77-snapgene-utf8.diff
  2. +69
    -0
      l/biopython/biopython-1.77-snapgene-wrapping-features.diff
  3. +3
    -1
      l/biopython/biopython.SlackBuild

+ 40
- 0
l/biopython/biopython-1.77-snapgene-utf8.diff View File

@@ -0,0 +1,41 @@
commit 86ff00b49004091a95933167824088f7682ee698
Author: Damien Goutte-Gattat <dgouttegattat@incenp.org>
Date: Fri Jul 31 00:18:32 2020 +0100

SnapGene: Parse XML data as UTF8-encoded strings (#3180).
All XML chunks within SnapGene packets should be parsed as UTF-8,
as per the XML specification in the absence of an explicit encoding
attribute.

diff --git a/Bio/SeqIO/SnapGeneIO.py b/Bio/SeqIO/SnapGeneIO.py
index 9c8598b50..9464e682f 100644
--- a/Bio/SeqIO/SnapGeneIO.py
+++ b/Bio/SeqIO/SnapGeneIO.py
@@ -75,7 +75,7 @@ def _parse_notes_packet(length, data, record):
This type of packet contains some metadata about the sequence. They
are stored as a XML string with a 'Notes' root node.
"""
- xml = parseString(data.decode("ASCII"))
+ xml = parseString(data.decode("UTF-8"))
type = _get_child_value(xml, "Type")
if type == "Synthetic":
record.annotations["data_file_division"] = "SYN"
@@ -116,7 +116,7 @@ def _parse_features_packet(length, data, record):
which are in a dedicated Primers packet). The data is a XML string
starting with a 'Features' root node.
"""
- xml = parseString(data.decode("ASCII"))
+ xml = parseString(data.decode("UTF-8"))
for feature in xml.getElementsByTagName("Feature"):
quals = {}
@@ -170,7 +170,7 @@ def _parse_primers_packet(length, data, record):
stores primer binding features. The data is a XML string starting
with a 'Primers' root node.
"""
- xml = parseString(data.decode("ASCII"))
+ xml = parseString(data.decode("UTF-8"))
for primer in xml.getElementsByTagName("Primer"):
quals = {}

+ 69
- 0
l/biopython/biopython-1.77-snapgene-wrapping-features.diff View File

@@ -0,0 +1,70 @@
commit 119845697972a7ea3d6c2574dd5e2bdbe53b905e
Author: Damien Goutte-Gattat <dgouttegattat@incenp.org>
Date: Fri Jul 31 00:23:29 2020 +0100

SnapGene: Support features wrapping the origin.
A SnapGene file containing a circular sequence may contain feature
range specifications that wrap the beginning/end of the sequence,
e.g. "3925-2" for a feature starting at position 3925 near the end
of the sequence and ending at position 2. This case must be
recognized and represented as a CompoundLocation in the SeqRecord
object.

diff --git a/Bio/SeqIO/SnapGeneIO.py b/Bio/SeqIO/SnapGeneIO.py
index 9464e682f..ad1bd3258 100644
--- a/Bio/SeqIO/SnapGeneIO.py
+++ b/Bio/SeqIO/SnapGeneIO.py
@@ -109,6 +109,18 @@ def _parse_cookie_packet(length, data, record):
raise ValueError("The file is not a valid SnapGene file")
+def _parse_location(rangespec, strand, record):
+ start, end = [int(x) for x in rangespec.split("-")]
+ # Account for SnapGene's 1-based coordinates
+ start = start - 1
+ if start > end:
+ # Range wrapping the end of the sequence
+ location = FeatureLocation(start, len(record), strand=strand) + FeatureLocation(0, end, strand=strand)
+ else:
+ location = FeatureLocation(start, end, strand=strand)
+ return location
+
+
def _parse_features_packet(length, data, record):
"""Parse a sequence features packet.
@@ -135,13 +147,10 @@ def _parse_features_packet(length, data, record):
location = None
for segment in feature.getElementsByTagName("Segment"):
rng = _get_attribute_value(segment, "range")
- start, end = [int(x) for x in rng.split("-")]
- # Account for SnapGene's 1-based coordinates
- start = start - 1
if not location:
- location = FeatureLocation(start, end, strand=strand)
+ location = _parse_location(rng, strand, record)
else:
- location = location + FeatureLocation(start, end, strand=strand)
+ location = location + _parse_location(rng, strand, record)
if not location:
raise ValueError("Missing feature location")
@@ -182,8 +191,6 @@ def _parse_primers_packet(length, data, record):
rng = _get_attribute_value(
site, "location", error="Missing binding site location"
)
- start, end = [int(x) for x in rng.split("-")]
-
strand = int(_get_attribute_value(site, "boundStrand", default="0"))
if strand == 1:
strand = -1
@@ -191,7 +198,7 @@ def _parse_primers_packet(length, data, record):
strand = +1
feature = SeqFeature(
- FeatureLocation(start, end, strand=strand),
+ _parse_location(rng, strand, record),
type="primer_bind",
qualifiers=quals,
)

+ 3
- 1
l/biopython/biopython.SlackBuild View File

@@ -35,7 +35,7 @@ ARCHIVE=${ARCHIVE:-$NAMESRC-$VERSION.tar.gz}

# Build infos
NAMEPKG=${NAMEPKG:-biopython}
-BUILD=${BUILD:-1GGD}
+BUILD=${BUILD:-2GGD}
ARCH=${ARCH:-$(uname -m | sed 's/^i.86$/i486/;s/^arm.*/arm/')}
EXT=${EXT:-txz}

@@ -82,6 +82,8 @@ cd $TMP
echo "Building $ARCHIVE..."
tar xf $CWD/$ARCHIVE
cd $NAME
+patch -p 1 < $CWD/biopython-1.77-snapgene-utf8.diff
+patch -p 1 < $CWD/biopython-1.77-snapgene-wrapping-features.diff
CFLAGS=$CPUOPT \
CXXFLAGS=$CPUOPT \
python3 setup.py build


Loading…
Cancel
Save