Browse Source

Cope with non-ASCII and embedded HTML in features.

Feature qualifiers in some SnapGene files may contain embedded
HTML tags and non-ASCII characters. Get rid of the tags and
silently ignore non-ASCII Unicode characters.
snapgene-write
Damien Goutte-Gattat 4 years ago
parent
commit
358c18aa53
  1. 12
      incenp/bio/seqio/SnapGeneIO.py

12
incenp/bio/seqio/SnapGeneIO.py

@ -27,6 +27,7 @@ from Bio.SeqFeature import SeqFeature, FeatureLocation
from struct import unpack
from xml.dom.minidom import parseString
from datetime import datetime
from re import sub
class _SegmentIterator:
@ -136,9 +137,9 @@ def _parse_features_segment(length, data, record):
qvalues = []
for value in qualifier.getElementsByTagName('V'):
if value.attributes.has_key('text'):
qvalues.append(value.attributes['text'].value)
qvalues.append(_decode(value.attributes['text'].value))
elif value.attributes.has_key('predef'):
qvalues.append(value.attributes['predef'].value)
qvalues.append(_decode(value.attributes['predef'].value))
elif value.attributes.has_key('int'):
qvalues.append(int(value.attributes['int'].value))
quals[qname] = qvalues
@ -182,9 +183,12 @@ _segment_handlers = {
# Helper functions to process the XML data in
# some of the segments
def _decode(text):
return sub('<[^>]+>', '', text).encode('ascii', 'ignore')
def _get_attribute_value(node, name, default=None, error=None):
if node.attributes.has_key(name):
return node.attributes[name].value
return _decode(node.attributes[name].value)
elif error:
raise ValueError(error)
else:
@ -194,7 +198,7 @@ def _get_attribute_value(node, name, default=None, error=None):
def _get_child_value(node, name, default=None, error=None):
children = node.getElementsByTagName(name)
if children and children[0].childNodes and children[0].firstChild.nodeType == node.TEXT_NODE:
return children[0].firstChild.data
return _decode(children[0].firstChild.data)
elif error:
raise ValueError(error)
else:

Loading…
Cancel
Save