[Biopython] EMBL DNA records with locations referencing other sequences

Adam Sjøgren asjo at koldfront.dk
Fri Oct 4 15:21:56 UTC 2019


Adam writes:

> If only days were 28 hours long...

How about something like this:

[PATCH] Add optional dictionary references to extract()

If the location refers to other records, those records can be supplied
in an optional references dictionary, where the records will be looked
up by the ref (key) and the value is expected to be the same as the
parent_sequence parameter.

Refs https://github.com/biopython/biopython/issues/808
---
 Bio/SeqFeature.py        | 25 +++++++++++++++++--------
 Tests/test_SeqFeature.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/Bio/SeqFeature.py b/Bio/SeqFeature.py
index 1a8e753..37957e9 100644
--- a/Bio/SeqFeature.py
+++ b/Bio/SeqFeature.py
@@ -298,7 +298,7 @@ class SeqFeature(object):
                           id=self.id,
                           qualifiers=OrderedDict(self.qualifiers.items()))
 
-    def extract(self, parent_sequence):
+    def extract(self, parent_sequence, references=None):
         """Extract the feature's sequence from supplied parent sequence.
 
         The parent_sequence can be a Seq like object or a string, and will
@@ -308,7 +308,9 @@ class SeqFeature(object):
         This should cope with complex locations including complements, joins
         and fuzzy positions. Even mixed strand features should work! This
         also covers features on protein sequences (e.g. domains), although
-        here reverse strand features are not permitted.
+        here reverse strand features are not permitted. If the
+        location refers to other records, they must be supplied in the
+        optional dictionary references.
 
         >>> from Bio.Seq import Seq
         >>> from Bio.Alphabet import generic_protein
@@ -335,7 +337,7 @@ class SeqFeature(object):
         if self.location is None:
             raise ValueError("The feature's .location is None. Check the "
                              "sequence file for a valid location.")
-        return self.location.extract(parent_sequence)
+        return self.location.extract(parent_sequence, references=references)
 
     def translate(self, parent_sequence, table="Standard", start_offset=None,
                   stop_symbol="*", to_stop=False, cds=None, gap=None):
@@ -1047,12 +1049,14 @@ class FeatureLocation(object):
                 return None
             raise
 
-    def extract(self, parent_sequence):
+    def extract(self, parent_sequence, references=None):
         """Extract the sequence from supplied parent sequence using the FeatureLocation object.
 
         The parent_sequence can be a Seq like object or a string, and will
         generally return an object of the same type. The exception to this is
         a MutableSeq as the parent sequence will return a Seq object.
+        If the location refers to other records, they must be supplied
+        in the optional dictionary references.
 
         >>> from Bio.Seq import Seq
         >>> from Bio.Alphabet import generic_protein
@@ -1064,8 +1068,11 @@ class FeatureLocation(object):
 
         """
         if self.ref or self.ref_db:
-            # TODO - Take a dictionary as an optional argument?
-            raise ValueError("Feature references another sequence.")
+            if not references:
+                raise ValueError("Feature references another sequence ({}), references mandatory".format(self.ref))
+            if not references.get(self.ref):
+                raise ValueError("Feature references another sequence ({}), not found in references".format(self.ref))
+            parent_sequence = references[self.ref]
         if isinstance(parent_sequence, MutableSeq):
             # This avoids complications with reverse complements
             # (the MutableSeq reverse complement acts in situ)
@@ -1462,12 +1469,14 @@ class CompoundLocation(object):
         """Not present in CompoundLocation, dummy method for API compatibility."""
         return None
 
-    def extract(self, parent_sequence):
+    def extract(self, parent_sequence, references=None):
         """Extract the sequence from supplied parent sequence using the CompoundLocation object.
 
         The parent_sequence can be a Seq like object or a string, and will
         generally return an object of the same type. The exception to this is
         a MutableSeq as the parent sequence will return a Seq object.
+        If the location refers to other records, they must be supplied
+        in the optional dictionary references.
 
         >>> from Bio.Seq import Seq
         >>> from Bio.Alphabet import generic_protein
@@ -1481,7 +1490,7 @@ class CompoundLocation(object):
 
         """
         # This copes with mixed strand features & all on reverse:
-        parts = [loc.extract(parent_sequence) for loc in self.parts]
+        parts = [loc.extract(parent_sequence, references=references) for loc in self.parts]
         # We use addition rather than a join to avoid alphabet issues:
         f_seq = parts[0]
         for part in parts[1:]:
diff --git a/Tests/test_SeqFeature.py b/Tests/test_SeqFeature.py
index 5655ff7..17f3cd8 100644
--- a/Tests/test_SeqFeature.py
+++ b/Tests/test_SeqFeature.py
@@ -10,7 +10,7 @@ import unittest
 
 from os import path
 
-from Bio import Seq, SeqIO
+from Bio import Seq, SeqIO, SeqRecord
 from Bio.Alphabet import generic_dna
 from Bio.Data.CodonTable import TranslationError
 from Bio.SeqFeature import FeatureLocation, AfterPosition, BeforePosition
@@ -223,6 +223,51 @@ class TestPositions(unittest.TestCase):
         self.assertEqual(oneof_pos.position_choices, oneof_pos2.position_choices)
 
 
+class TestExtract(unittest.TestCase):
+
+    def test_reference_in_location_record(self):
+        """Test location with reference to another record."""
+        parent_record = SeqRecord.SeqRecord(seq=Seq.Seq("actg"))
+        another_record = SeqRecord.SeqRecord(seq=Seq.Seq("gtcagctac"))
+        location = FeatureLocation(5, 8, ref="ANOTHER.7")
+        with self.assertRaises(ValueError) as err:
+            location.extract(parent_record)
+        assert "Feature references another sequence (ANOTHER.7), references mandatory" in str(err.exception)
+        with self.assertRaises(ValueError) as err:
+            location.extract(parent_record, references={"SOMEOTHER.2": another_record})
+        assert "Feature references another sequence (ANOTHER.7), not found in references" in str(err.exception)
+        self.assertEqual(str(location.extract(parent_record, references={"ANOTHER.7": another_record}).seq), "cta")
+
+    def test_reference_in_location_sequence(self):
+        """Test location with reference to another sequence."""
+        parent_sequence = Seq.Seq("actg")
+        another_sequence = Seq.Seq("gtcagctac")
+        location = FeatureLocation(5, 8, ref="ANOTHER.7")
+        self.assertEqual(str(location.extract(parent_sequence, references={"ANOTHER.7": another_sequence})), "cta")
+
+    def test_reference_in_compound_location_record(self):
+        "Test compound location with reference to another record."""
+        parent_record = SeqRecord.SeqRecord(Seq.Seq("aaccaaccaaccaaccaa"))
+        another_record = SeqRecord.SeqRecord(Seq.Seq("ttggttggttggttggtt"))
+        location = FeatureLocation(2, 6) + FeatureLocation(5, 8, ref="ANOTHER.7")
+        with self.assertRaises(ValueError) as err:
+            location.extract(parent_record)
+        assert "Feature references another sequence (ANOTHER.7), references mandatory" in str(err.exception)
+        with self.assertRaises(ValueError) as err:
+            location.extract(parent_record, references={"SOMEOTHER.2": another_record})
+        assert "Feature references another sequence (ANOTHER.7), not found in references" in str(err.exception)
+        self.assertEqual(str(location.extract(parent_record, references={"ANOTHER.7": another_record}).seq),
+                         "ccaa" + "tgg")
+
+    def test_reference_in_compound_location_sequence(self):
+        "Test compound location with reference to another sequence."""
+        parent_sequence = Seq.Seq("aaccaaccaaccaaccaa")
+        another_sequence = Seq.Seq("ttggttggttggttggtt")
+        location = FeatureLocation(2, 6) + FeatureLocation(5, 8, ref="ANOTHER.7")
+        self.assertEqual(str(location.extract(parent_sequence, references={"ANOTHER.7": another_sequence})),
+                         "ccaa" + "tgg")
+
+
 if __name__ == "__main__":
     runner = unittest.TextTestRunner(verbosity=2)
     unittest.main(testRunner=runner)
-- 
2.7.4

-- 
 "Another casualty of applied metaphysics."                   Adam Sjøgren
                                                         asjo at koldfront.dk



More information about the Biopython mailing list