| Trees | Indices | Help |
|---|
|
|
1 # Copyright 1999 by Jeffrey Chang. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
5
6 """
7 This module provides code to work with the sprotXX.dat file from
8 Utilities for working with FASTA-formatted sequences (DEPRECATED).
9 http://www.expasy.ch/sprot/sprot-top.html
10
11 Please see Bio.SwissProt for alternatives for the functionality in this module.
12
13 Tested with:
14 Release 37, Release 38, Release 39
15
16 Limited testing with:
17 Release 51, 54
18
19
20 Classes:
21 Record Holds SwissProt data.
22 Reference Holds reference data from a SwissProt entry.
23 Iterator Iterates over entries in a SwissProt file.
24 Dictionary Accesses a SwissProt file using a dictionary interface.
25 RecordParser Parses a SwissProt record into a Record object.
26 SequenceParser Parses a SwissProt record into a SeqRecord object.
27
28 _Scanner Scans SwissProt-formatted data.
29 _RecordConsumer Consumes SwissProt data to a SProt.Record object.
30 _SequenceConsumer Consumes SwissProt data to a SeqRecord object.
31
32
33 Functions:
34 index_file Index a SwissProt file for a Dictionary.
35
36 """
37 import warnings
38 warnings.warn("Bio.SwissProt.SProt is deprecated. Please use the functions Bio.SwissProt.parse or Bio.SwissProt.read if you want to get a SwissProt.Record, or Bio.SeqIO.parse or Bio.SeqIO.read if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).",
39 DeprecationWarning)
40
41 from types import *
42 import os
43 from Bio import File
44 from Bio import Index
45 from Bio import Alphabet
46 from Bio import Seq
47 from Bio import SeqRecord
48 from Bio.ParserSupport import *
49
50 # The parse(), read() functions can probably be simplified if we don't
51 # use the "parser = RecordParser(); parser.parse(handle)" approach.
53 from SProt import RecordParser
54 import cStringIO
55 parser = RecordParser()
56 text = ""
57 for line in handle:
58 text += line
59 if line[:2]=='//':
60 handle = cStringIO.StringIO(text)
61 record = parser.parse(handle)
62 text = ""
63 yield record
64
66 from SProt import RecordParser
67 parser = RecordParser()
68 try:
69 record = parser.parse(handle)
70 except ValueError, error:
71 if error.message.startswith("Line does not start with 'ID':"):
72 raise ValueError("No SwissProt record found")
73 else:
74 raise error
75 # We should have reached the end of the record by now
76 remainder = handle.read()
77 if remainder:
78 raise ValueError("More than one SwissProt record found")
79 return record
80
81
82 _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation
83
85 """Holds information from a SwissProt record.
86
87 Members:
88 entry_name Name of this entry, e.g. RL1_ECOLI.
89 data_class Either 'STANDARD' or 'PRELIMINARY'.
90 molecule_type Type of molecule, 'PRT',
91 sequence_length Number of residues.
92
93 accessions List of the accession numbers, e.g. ['P00321']
94 created A tuple of (date, release).
95 sequence_update A tuple of (date, release).
96 annotation_update A tuple of (date, release).
97
98 description Free-format description.
99 gene_name Gene name. See userman.txt for description.
100 organism The source of the sequence.
101 organelle The origin of the sequence.
102 organism_classification The taxonomy classification. List of strings.
103 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
104 taxonomy_id A list of NCBI taxonomy id's.
105 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
106 if any.
107 references List of Reference objects.
108 comments List of strings.
109 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
110 keywords List of the keywords.
111 features List of tuples (key name, from, to, description).
112 from and to can be either integers for the residue
113 numbers, '<', '>', or '?'
114
115 seqinfo tuple of (length, molecular weight, CRC32 value)
116 sequence The sequence.
117
118 """
120 self.entry_name = None
121 self.data_class = None
122 self.molecule_type = None
123 self.sequence_length = None
124
125 self.accessions = []
126 self.created = None
127 self.sequence_update = None
128 self.annotation_update = None
129
130 self.description = ''
131 self.gene_name = ''
132 self.organism = ''
133 self.organelle = ''
134 self.organism_classification = []
135 self.taxonomy_id = []
136 self.host_organism = []
137 self.references = []
138 self.comments = []
139 self.cross_references = []
140 self.keywords = []
141 self.features = []
142
143 self.seqinfo = None
144 self.sequence = ''
145
147 """Holds information from 1 references in a SwissProt entry.
148
149 Members:
150 number Number of reference in an entry.
151 positions Describes extent of work. list of strings.
152 comments Comments. List of (token, text).
153 references References. List of (dbname, identifier)
154 authors The authors of the work.
155 title Title of the work.
156 location A citation for the work.
157
158 """
167
168
170 """Accesses a SwissProt file using a dictionary interface.
171
172 """
173 __filename_key = '__filename'
174
176 """__init__(self, indexname, parser=None)
177
178 Open a SwissProt Dictionary. indexname is the name of the
179 index for the dictionary. The index should have been created
180 using the index_file function. parser is an optional Parser
181 object to change the results into another form. If set to None,
182 then the raw contents of the file will be returned.
183
184 """
185 self._index = Index.Index(indexname)
186 self._handle = open(self._index[self.__filename_key])
187 self._parser = parser
188
190 return len(self._index)
191
193 start, len = self._index[key]
194 self._handle.seek(start)
195 data = self._handle.read(len)
196 if self._parser is not None:
197 return self._parser.parse(File.StringHandle(data))
198 return data
199
202
204 # I only want to expose the keys for SwissProt.
205 k = self._index.keys()
206 k.remove(self.__filename_key)
207 return k
208
209
221
223 """Parses SwissProt data into a standard SeqRecord object.
224 """
226 """Initialize a SequenceParser.
227
228 Arguments:
229 o alphabet - The alphabet to use for the generated Seq objects. If
230 not supplied this will default to the generic protein alphabet.
231 """
232 self._scanner = _Scanner()
233 self._consumer = _SequenceConsumer(alphabet)
234
238
240 """Scans SwissProt-formatted data.
241
242 Tested with:
243 Release 37
244 Release 38
245 """
246
248 """feed(self, handle, consumer)
249
250 Feed in SwissProt data for scanning. handle is a file-like
251 object that contains swissprot data. consumer is a
252 Consumer object that will receive events as the report is scanned.
253
254 """
255 if isinstance(handle, File.UndoHandle):
256 uhandle = handle
257 else:
258 uhandle = File.UndoHandle(handle)
259 self._scan_record(uhandle, consumer)
260
262 """Ignores any lines starting **"""
263 #See Bug 2353, some files from the EBI have extra lines
264 #starting "**" (two asterisks/stars), usually between the
265 #features and sequence but not all the time. They appear
266 #to be unofficial automated annotations. e.g.
267 #**
268 #** ################# INTERNAL SECTION ##################
269 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
270 while "**" == uhandle.peekline()[:2]:
271 skip = uhandle.readline()
272 #print "Skipping line: %s" % skip.rstrip()
273
275 consumer.start_record()
276 for fn in self._scan_fns:
277 self._skip_starstar(uhandle)
278 fn(self, uhandle, consumer)
279
280 # In Release 38, ID N33_HUMAN has a DR buried within comments.
281 # Check for this and do more comments, if necessary.
282 # XXX handle this better
283 if fn is self._scan_dr.im_func:
284 self._scan_cc(uhandle, consumer)
285 self._scan_dr(uhandle, consumer)
286 consumer.end_record()
287
288 - def _scan_line(self, line_type, uhandle, event_fn,
289 exactly_one=None, one_or_more=None, any_number=None,
290 up_to_one=None):
291 # Callers must set exactly one of exactly_one, one_or_more, or
292 # any_number to a true value. I do not explicitly check to
293 # make sure this function is called correctly.
294
295 # This does not guarantee any parameter safety, but I
296 # like the readability. The other strategy I tried was have
297 # parameters min_lines, max_lines.
298
299 if exactly_one or one_or_more:
300 read_and_call(uhandle, event_fn, start=line_type)
301 if one_or_more or any_number:
302 while 1:
303 if not attempt_read_and_call(uhandle, event_fn,
304 start=line_type):
305 break
306 if up_to_one:
307 attempt_read_and_call(uhandle, event_fn, start=line_type)
308
311
313 # Until release 38, this used to match exactly_one.
314 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the
315 # definition needed to be expanded.
316 self._scan_line('AC', uhandle, consumer.accession, any_number=1)
317
319 self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
320 self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
321 # IPI doesn't necessarily contain the third line about annotations
322 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
323
325 # IPI can be missing a DE line
326 self._scan_line('DE', uhandle, consumer.description, any_number=1)
327
330
334
337
341
345
347 # viral host organism. introduced after SwissProt 39.
348 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)
349
351 while True:
352 if safe_peekline(uhandle)[:2] != 'RN':
353 break
354 self._scan_rn(uhandle, consumer)
355 self._scan_rp(uhandle, consumer)
356 self._scan_rc(uhandle, consumer)
357 self._scan_rx(uhandle, consumer)
358 # ws:2001-12-05 added, for record with RL before RA
359 self._scan_rl(uhandle, consumer)
360 self._scan_ra(uhandle, consumer)
361 #EBI copy of P72010 is missing the RT line, and has one
362 #of their ** lines in its place noting "** /NO TITLE."
363 #See also bug 2353
364 self._skip_starstar(uhandle)
365 self._scan_rt(uhandle, consumer)
366 self._scan_rl(uhandle, consumer)
367
371
375
379
383
385 # In UniProt release 1.12 of 6/21/04, there is a new RG
386 # (Reference Group) line, which references a group instead of
387 # an author. Each block must have at least 1 RA or RG line.
388 self._scan_line('RA', uhandle, consumer.reference_author,
389 any_number=1)
390 self._scan_line('RG', uhandle, consumer.reference_author,
391 any_number=1)
392 # PRKN_HUMAN has RG lines, then RA lines. The best solution
393 # is to write code that accepts either of the line types.
394 # This is the quick solution...
395 self._scan_line('RA', uhandle, consumer.reference_author,
396 any_number=1)
397
401
403 # This was one_or_more, but P82909 in TrEMBL 16.0 does not
404 # have one.
405 self._scan_line('RL', uhandle, consumer.reference_location,
406 any_number=1)
407
410
414
417
420
423
426
429
432
433 _scan_fns = [
434 _scan_id,
435 _scan_ac,
436 _scan_dt,
437 _scan_de,
438 _scan_gn,
439 _scan_os,
440 _scan_og,
441 _scan_oc,
442 _scan_ox,
443 _scan_oh,
444 _scan_reference,
445 _scan_cc,
446 _scan_dr,
447 _scan_pe,
448 _scan_kw,
449 _scan_ft,
450 _scan_sq,
451 _scan_sequence_data,
452 _scan_terminator
453 ]
454
456 """Consumer that converts a SwissProt record to a Record object.
457
458 Members:
459 data Record with SwissProt data.
460
461 """
463 self.data = None
464
467
471
475
477 cols = line.split()
478 #Prior to release 51, included with MoleculeType:
479 #ID EntryName DataClass; MoleculeType; SequenceLength.
480 #
481 #Newer files lack the MoleculeType:
482 #ID EntryName DataClass; SequenceLength.
483 #
484 #Note that cols is split on white space, so the length
485 #should become two fields (number and units)
486 if len(cols) == 6:
487 self.data.entry_name = cols[1]
488 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';'
489 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';'
490 self.data.sequence_length = int(cols[4])
491 elif len(cols) == 5:
492 self.data.entry_name = cols[1]
493 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';'
494 self.data.molecule_type = None
495 self.data.sequence_length = int(cols[3])
496 else:
497 #Should we print a warning an continue?
498 raise ValueError("ID line has unrecognised format:\n"+line)
499
500 # data class can be 'STANDARD' or 'PRELIMINARY'
501 # ws:2001-12-05 added IPI
502 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed'
503 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI',
504 'Reviewed', 'Unreviewed']:
505 raise ValueError("Unrecognized data class %s in line\n%s" % \
506 (self.data.data_class, line))
507 # molecule_type should be 'PRT' for PRoTein
508 # Note that has been removed in recent releases (set to None)
509 if self.data.molecule_type is not None \
510 and self.data.molecule_type != 'PRT':
511 raise ValueError("Unrecognized molecule type %s in line\n%s" % \
512 (self.data.molecule_type, line))
513
515 cols = line[5:].rstrip(_CHOMP).strip().split(';')
516 for ac in cols:
517 if ac.strip():
518 #remove any leading or trailing white space
519 self.data.accessions.append(ac.strip())
520
522 uprline = line.upper()
523 cols = line.rstrip().split()
524
525 if uprline.find('CREATED') >= 0 \
526 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \
527 or uprline.find('LAST ANNOTATION UPDATE') >= 0:
528 # Old style DT line
529 # =================
530 # e.g.
531 # DT 01-FEB-1995 (Rel. 31, Created)
532 # DT 01-FEB-1995 (Rel. 31, Last sequence update)
533 # DT 01-OCT-2000 (Rel. 40, Last annotation update)
534 #
535 # or:
536 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created)
537 # ...
538
539 # find where the version information will be located
540 # This is needed for when you have cases like IPI where
541 # the release verison is in a different spot:
542 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created)
543 uprcols = uprline.split()
544 rel_index = -1
545 for index in range(len(uprcols)):
546 if uprcols[index].find("REL.") >= 0:
547 rel_index = index
548 assert rel_index >= 0, \
549 "Could not find Rel. in DT line: %s" % (line)
550 version_index = rel_index + 1
551 # get the version information
552 str_version = cols[version_index].rstrip(_CHOMP)
553 # no version number
554 if str_version == '':
555 version = 0
556 # dot versioned
557 elif str_version.find(".") >= 0:
558 version = str_version
559 # integer versioned
560 else:
561 version = int(str_version)
562
563 if uprline.find('CREATED') >= 0:
564 self.data.created = cols[1], version
565 elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
566 self.data.sequence_update = cols[1], version
567 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
568 self.data.annotation_update = cols[1], version
569 else:
570 assert False, "Shouldn't reach this line!"
571 elif uprline.find('INTEGRATED INTO') >= 0 \
572 or uprline.find('SEQUENCE VERSION') >= 0 \
573 or uprline.find('ENTRY VERSION') >= 0:
574 # New style DT line
575 # =================
576 # As of UniProt Knowledgebase release 7.0 (including
577 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the
578 # format of the DT lines and the version information
579 # in them was changed - the release number was dropped.
580 #
581 # For more information see bug 1948 and
582 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0
583 #
584 # e.g.
585 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot.
586 # DT 15-OCT-2001, sequence version 3.
587 # DT 01-APR-2004, entry version 14.
588 #
589 #This is a new style DT line...
590
591 # The date should be in string cols[1]
592 # Get the version number if there is one.
593 # For the three DT lines above: 0, 3, 14
594 try:
595 version = int(cols[-1])
596 except ValueError:
597 version = 0
598
599 # Re-use the historical property names, even though
600 # the meaning has changed slighty:
601 if uprline.find("INTEGRATED") >= 0:
602 self.data.created = cols[1], version
603 elif uprline.find('SEQUENCE VERSION') >= 0:
604 self.data.sequence_update = cols[1], version
605 elif uprline.find( 'ENTRY VERSION') >= 0:
606 self.data.annotation_update = cols[1], version
607 else:
608 assert False, "Shouldn't reach this line!"
609 else:
610 raise ValueError("I don't understand the date line %s" % line)
611
614
617
620
623
625 line = line[5:].rstrip(_CHOMP)
626 cols = line.split(';')
627 for col in cols:
628 self.data.organism_classification.append(col.lstrip())
629
631 # The OX line is in the format:
632 # OX DESCRIPTION=ID[, ID]...;
633 # If there are too many id's to fit onto a line, then the ID's
634 # continue directly onto the next line, e.g.
635 # OX DESCRIPTION=ID[, ID]...
636 # OX ID[, ID]...;
637 # Currently, the description is always "NCBI_TaxID".
638
639 # To parse this, I need to check to see whether I'm at the
640 # first line. If I am, grab the description and make sure
641 # it's an NCBI ID. Then, grab all the id's.
642 line = line[5:].rstrip(_CHOMP)
643 index = line.find('=')
644 if index >= 0:
645 descr = line[:index]
646 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
647 ids = line[index+1:].split(',')
648 else:
649 ids = line.split(',')
650 self.data.taxonomy_id.extend([id.strip() for id in ids])
651
653 # Line type OH (Organism Host) for viral hosts
654 # same code as in taxonomy_id()
655 line = line[5:].rstrip(_CHOMP)
656 index = line.find('=')
657 if index >= 0:
658 descr = line[:index]
659 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
660 ids = line[index+1:].split(',')
661 else:
662 ids = line.split(',')
663 self.data.host_organism.extend([id.strip() for id in ids])
664
666 rn = line[5:].rstrip()
667 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
668 ref = Reference()
669 ref.number = int(rn[1:-1])
670 self.data.references.append(ref)
671
673 assert self.data.references, "RP: missing RN"
674 self.data.references[-1].positions.append(line[5:].rstrip())
675
677 assert self.data.references, "RC: missing RN"
678 cols = line[5:].rstrip().split( ';')
679 ref = self.data.references[-1]
680 for col in cols:
681 if not col: # last column will be the empty string
682 continue
683 # The token is everything before the first '=' character.
684 index = col.find('=')
685 token, text = col[:index], col[index+1:]
686 # According to the spec, there should only be 1 '='
687 # character. However, there are too many exceptions to
688 # handle, so we'll ease up and allow anything after the
689 # first '='.
690 #if col == ' STRAIN=TISSUE=BRAIN':
691 # # from CSP_MOUSE, release 38
692 # token, text = "TISSUE", "BRAIN"
693 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485':
694 # # from NDOA_PSEPU, release 38
695 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485"
696 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \
697 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987':
698 # # from NU3M_BALPH, release 38, release 39
699 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987"
700 #else:
701 # token, text = string.split(col, '=')
702 ref.comments.append((token.lstrip(), text))
703
705 assert self.data.references, "RX: missing RN"
706 # The basic (older?) RX line is of the form:
707 # RX MEDLINE; 85132727.
708 # but there are variants of this that need to be dealt with (see below)
709
710 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
711 # have extraneous information in the RX line. Check for
712 # this and chop it out of the line.
713 # (noticed by katel@worldpath.net)
714 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
715 if ind >= 0:
716 line = line[:ind]
717
718 # RX lines can also be used of the form
719 # RX PubMed=9603189;
720 # reported by edvard@farmasi.uit.no
721 # and these can be more complicated like:
722 # RX MEDLINE=95385798; PubMed=7656980;
723 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781;
724 # We look for these cases first and deal with them
725 if line.find("=") != -1:
726 cols = line[2:].split("; ")
727 cols = [x.strip() for x in cols]
728 cols = [x for x in cols if x]
729 for col in cols:
730 x = col.split("=")
731 assert len(x) == 2, "I don't understand RX line %s" % line
732 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
733 ref = self.data.references[-1].references
734 ref.append((key, value))
735 # otherwise we assume we have the type 'RX MEDLINE; 85132727.'
736 else:
737 cols = line.split()
738 # normally we split into the three parts
739 assert len(cols) == 3, "I don't understand RX line %s" % line
740 self.data.references[-1].references.append(
741 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
742
747
749 assert self.data.references, "RT: missing RN"
750 ref = self.data.references[-1]
751 ref.title += line[5:]
752
754 assert self.data.references, "RL: missing RN"
755 ref = self.data.references[-1]
756 ref.location += line[5:]
757
759 if line[5:8] == '-!-': # Make a new comment
760 self.data.comments.append(line[9:])
761 elif line[5:8] == ' ': # add to the previous comment
762 if not self.data.comments:
763 # TCMO_STRGA in Release 37 has comment with no topic
764 self.data.comments.append(line[9:])
765 else:
766 self.data.comments[-1] += line[9:]
767 elif line[5:8] == '---':
768 # If there are no comments, and it's not the closing line,
769 # make a new comment.
770 if not self.data.comments or self.data.comments[-1][:3] != '---':
771 self.data.comments.append(line[5:])
772 else:
773 self.data.comments[-1] += line[5:]
774 else: # copyright notice
775 self.data.comments[-1] += line[5:]
776
778 # From CLD1_HUMAN, Release 39:
779 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence]
780 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai
781 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE.
782 line = line[5:]
783 # Remove the comments at the end of the line
784 i = line.find('[')
785 if i >= 0:
786 line = line[:i]
787 cols = line.rstrip(_CHOMP).split(';')
788 cols = [col.lstrip() for col in cols]
789 self.data.cross_references.append(tuple(cols))
790
792 cols = line[5:].rstrip(_CHOMP).split(';')
793 self.data.keywords.extend([c.lstrip() for c in cols])
794
796 line = line[5:] # get rid of junk in front
797 name = line[0:8].rstrip()
798 try:
799 from_res = int(line[9:15])
800 except ValueError:
801 from_res = line[9:15].lstrip()
802 try:
803 to_res = int(line[16:22])
804 except ValueError:
805 to_res = line[16:22].lstrip()
806 description = line[29:70].rstrip()
807 #if there is a feature_id (FTId), store it away
808 if line[29:35]==r"/FTId=":
809 ft_id = line[35:70].rstrip()[:-1]
810 else:
811 ft_id =""
812 if not name: # is continuation of last one
813 assert not from_res and not to_res
814 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1]
815 del self.data.features[-1]
816 description = "%s %s" % (old_description, description)
817
818 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no
819 if name == "VARSPLIC":
820 description = self._fix_varsplic_sequences(description)
821 self.data.features.append((name, from_res, to_res, description,ft_id))
822
824 """Remove unwanted spaces in sequences.
825
826 During line carryover, the sequences in VARSPLIC can get mangled
827 with unwanted spaces like:
828 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
829 We want to check for this case and correct it as it happens.
830 """
831 descr_cols = description.split(" -> ")
832 if len(descr_cols) == 2:
833 first_seq = descr_cols[0]
834 second_seq = descr_cols[1]
835 extra_info = ''
836 # we might have more information at the end of the
837 # second sequence, which should be in parenthesis
838 extra_info_pos = second_seq.find(" (")
839 if extra_info_pos != -1:
840 extra_info = second_seq[extra_info_pos:]
841 second_seq = second_seq[:extra_info_pos]
842
843 # now clean spaces out of the first and second string
844 first_seq = first_seq.replace(" ", "")
845 second_seq = second_seq.replace(" ", "")
846
847 # reassemble the description
848 description = first_seq + " -> " + second_seq + extra_info
849
850 return description
851
855
857 cols = line.split()
858 assert len(cols) == 8, "I don't understand SQ line %s" % line
859 # Do more checking here?
860 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
861
863 #It should be faster to make a list of strings, and join them at the end.
864 self._sequence_lines.append(line.replace(" ", "").rstrip())
865
868
869 #def _clean(self, line, rstrip=1):
870 # if rstrip:
871 # return string.rstrip(line[5:])
872 # return line[5:]
873
875 # Remove trailing newlines
876 members = ['description', 'gene_name', 'organism', 'organelle']
877 for m in members:
878 attr = getattr(rec, m)
879 setattr(rec, m, attr.rstrip())
880 for ref in rec.references:
881 self._clean_references(ref)
882
884 # Remove trailing newlines
885 members = ['authors', 'title', 'location']
886 for m in members:
887 attr = getattr(ref, m)
888 setattr(ref, m, attr.rstrip())
889
891 """Consumer that converts a SwissProt record to a SeqRecord object.
892
893 Members:
894 data Record with SwissProt data.
895 alphabet The alphabet the generated Seq objects will have.
896 """
897 #TODO - Cope with references as done for GenBank
899 """Initialize a Sequence Consumer
900
901 Arguments:
902 o alphabet - The alphabet to use for the generated Seq objects. If
903 not supplied this will default to the generic protein alphabet.
904 """
905 self.data = None
906 self.alphabet = alphabet
907
909 seq = Seq.Seq("", self.alphabet)
910 self.data = SeqRecord.SeqRecord(seq)
911 self.data.description = ""
912 self.data.name = ""
913 self._current_ref = None
914 self._sequence_lines = []
915
917 if self._current_ref is not None:
918 self.data.annotations['references'].append(self._current_ref)
919 self._current_ref = None
920 self.data.description = self.data.description.rstrip()
921 self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet)
922 self.data.annotations['organism'] = self.data.annotations['organism'].rstrip(_CHOMP)
923
927
929 #Note that files can and often do contain multiple AC lines.
930 ids = line[5:].strip().split(';')
931 #Remove any white space
932 ids = [x.strip() for x in ids if x.strip()]
933
934 #Use the first as the ID, but record them ALL in the annotations
935 try:
936 self.data.annotations['accessions'].extend(ids)
937 except KeyError:
938 self.data.annotations['accessions'] = ids
939
940 #Use the FIRST accession as the ID, not the first on this line!
941 self.data.id = self.data.annotations['accessions'][0]
942 #self.data.id = ids[0]
943
946
948 #It should be faster to make a list of strings, and join them at the end.
949 self._sequence_lines.append(line.replace(" ", "").rstrip())
950
952 #We already store the identification/accession as the records name/id
953 try:
954 self.data.annotations['gene_name'] += " " + line[5:].rstrip()
955 except KeyError:
956 self.data.annotations['gene_name'] = line[5:].rstrip()
957
959 #Try and agree with SeqRecord convention from the GenBank parser,
960 #which stores the comments as a long string with newlines
961 #with key 'comment'
962 #TODO - Follow SwissProt conventions more closely?
963 prefix = line[5:8]
964 text = line[9:].rstrip()
965 if prefix == '-!-': # Make a new comment
966 try:
967 self.data.annotations['comment'] += "\n" + text
968 except KeyError:
969 self.data.annotations['comment'] = text
970 elif prefix == ' ':
971 try:
972 # add to the previous comment
973 self.data.annotations['comment'] += " " + text
974 except KeyError:
975 # TCMO_STRGA in Release 37 has comment with no topic
976 self.data.annotations['comment'] = text
977
979 #Format of the line is described in the manual dated 04-Dec-2007 as:
980 #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY].
981 #However, some older files only seem to have a single identifier:
982 #DR DATABASE; PRIMARY.
983 #
984 #Also must cope with things like this from Tests/SwissProt/sp007,
985 #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain]
986 #
987 #Store these in the dbxref list, but for consistency with
988 #the GenBank parser and with what BioSQL can cope with,
989 #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER
990 parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")]
991 if len(parts) > 1:
992 value = "%s:%s" % (parts[0], parts[1])
993 #Avoid duplicate entries
994 if value not in self.data.dbxrefs:
995 self.data.dbxrefs.append(value)
996 #else:
997 #print "Bad DR line:\n%s" % line
998
999
1001 date_str = line.split()[1]
1002 uprline = line.upper()
1003 if uprline.find('CREATED') >= 0:
1004 #Try and agree with SeqRecord convention from the GenBank parser,
1005 #which stores the submitted date as 'date'
1006 self.data.annotations['date'] = date_str
1007 elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
1008 #There is no existing convention from the GenBank SeqRecord parser
1009 self.data.annotations['date_last_sequence_update'] = date_str
1010 elif uprline.find('LAST ANNOTATION UPDATE') >= 0:
1011 #There is no existing convention from the GenBank SeqRecord parser
1012 self.data.annotations['date_last_annotation_update'] = date_str
1013 elif uprline.find('INTEGRATED INTO') >= 0:
1014 self.data.annotations['date'] = date_str.rstrip(",")
1015 elif uprline.find('SEQUENCE VERSION') >= 0:
1016 self.data.annotations['date_last_sequence_update'] = date_str.rstrip(",")
1017 elif uprline.find('ENTRY VERSION') >= 0:
1018 self.data.annotations['date_last_annotation_update'] = date_str.rstrip(",")
1019
1021 #Try and agree with SeqRecord convention from the GenBank parser,
1022 #which stores a list as 'keywords'
1023 cols = line[5:].rstrip(_CHOMP).split(';')
1024 cols = [c.strip() for c in cols]
1025 cols = filter(None, cols)
1026 try:
1027 #Extend any existing list of keywords
1028 self.data.annotations['keywords'].extend(cols)
1029 except KeyError:
1030 #Create the list of keywords
1031 self.data.annotations['keywords'] = cols
1032
1034 #Try and agree with SeqRecord convention from the GenBank parser,
1035 #which stores the organism as a string with key 'organism'
1036 data = line[5:].rstrip()
1037 try:
1038 #Append to any existing data split over multiple lines
1039 self.data.annotations['organism'] += " " + data
1040 except KeyError:
1041 self.data.annotations['organism'] = data
1042
1044 #There is no SeqRecord convention from the GenBank parser,
1045 data = line[5:].rstrip(_CHOMP)
1046 index = data.find('=')
1047 if index >= 0:
1048 descr = data[:index]
1049 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
1050 ids = data[index+1:].split(',')
1051 else:
1052 ids = data.split(',')
1053
1054 try:
1055 #Append to any existing data
1056 self.data.annotations['organism_host'].extend(ids)
1057 except KeyError:
1058 self.data.annotations['organism_host'] = ids
1059
1061 #Try and agree with SeqRecord convention from the GenBank parser,
1062 #which stores this taxonomy lineage ese as a list of strings with
1063 #key 'taxonomy'.
1064 #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX)
1065 line = line[5:].rstrip(_CHOMP)
1066 cols = [col.strip() for col in line.split(';')]
1067 try:
1068 #Append to any existing data
1069 self.data.annotations['taxonomy'].extend(cols)
1070 except KeyError:
1071 self.data.annotations['taxonomy'] = cols
1072
1074 #Try and agree with SeqRecord convention expected in BioSQL
1075 #the NCBI taxon id with key 'ncbi_taxid'.
1076 #Note that 'taxonomy' is used for the taxonomy lineage
1077 #(held as a list of strings, line type OC)
1078
1079 line = line[5:].rstrip(_CHOMP)
1080 index = line.find('=')
1081 if index >= 0:
1082 descr = line[:index]
1083 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
1084 ids = line[index+1:].split(',')
1085 else:
1086 ids = line.split(',')
1087
1088 try:
1089 #Append to any existing data
1090 self.data.annotations['ncbi_taxid'].extend(ids)
1091 except KeyError:
1092 self.data.annotations['ncbi_taxid'] = ids
1093
1095 """RN line, reference number (start of new reference)."""
1096 from Bio.SeqFeature import Reference
1097 # if we have a current reference that hasn't been added to
1098 # the list of references, add it.
1099 if self._current_ref is not None:
1100 self.data.annotations['references'].append(self._current_ref)
1101 else:
1102 self.data.annotations['references'] = []
1103
1104 self._current_ref = Reference()
1105
1107 """RP line, reference position."""
1108 assert self._current_ref is not None, "RP: missing RN"
1109 #Should try and store this in self._current_ref.location
1110 #but the SwissProt locations don't match easily to the
1111 #format used in GenBank...
1112 pass
1113
1115 """RX line, reference cross-references."""
1116 assert self._current_ref is not None, "RX: missing RN"
1117 # The basic (older?) RX line is of the form:
1118 # RX MEDLINE; 85132727.
1119 # or more recently:
1120 # RX MEDLINE=95385798; PubMed=7656980;
1121 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781;
1122 # We look for these cases first and deal with them
1123 if line.find("=") != -1:
1124 cols = line[2:].split("; ")
1125 cols = [x.strip() for x in cols]
1126 cols = [x for x in cols if x]
1127 for col in cols:
1128 x = col.split("=")
1129 assert len(x) == 2, "I don't understand RX line %s" % line
1130 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
1131 if key == "MEDLINE":
1132 self._current_ref.medline_id = value
1133 elif key == "PubMed":
1134 self._current_ref.pubmed_id = value
1135 else:
1136 #Sadly the SeqFeature.Reference object doesn't
1137 #support anything else (yet)
1138 pass
1139 # otherwise we assume we have the type 'RX MEDLINE; 85132727.'
1140 else:
1141 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
1142 # have extraneous information in the RX line. Check for
1143 # this and chop it out of the line.
1144 # (noticed by katel@worldpath.net)
1145 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
1146 if ind >= 0:
1147 line = line[:ind]
1148 cols = line.split()
1149 # normally we split into the three parts
1150 assert len(cols) == 3, "I don't understand RX line %s" % line
1151 key = cols[1].rstrip(_CHOMP)
1152 value = cols[2].rstrip(_CHOMP)
1153 if key == "MEDLINE":
1154 self._current_ref.medline_id = value
1155 elif key == "PubMed":
1156 self._current_ref.pubmed_id = value
1157 else:
1158 #Sadly the SeqFeature.Reference object doesn't
1159 #support anything else (yet)
1160 pass
1161
1168
1170 """RT line, reference title."""
1171 assert self._current_ref is not None, "RT: missing RN"
1172 if self._current_ref.title:
1173 self._current_ref.title += " "
1174 self._current_ref.title += line[5:].rstrip("\n")
1175
1177 """RL line, reference 'location' - journal, volume, pages, year."""
1178 assert self._current_ref is not None, "RL: missing RN"
1179 if self._current_ref.journal:
1180 self._current_ref.journal += " "
1181 self._current_ref.journal += line[5:].rstrip("\n")
1182
1184 """RC line, reference comment."""
1185 assert self._current_ref is not None, "RC: missing RN"
1186 #This has a key=value; structure...
1187 #Can we do a better job with the current Reference class?
1188 if self._current_ref.comment:
1189 self._current_ref.comment += " "
1190 self._current_ref.comment += line[5:].rstrip("\n")
1191
1193 """index_file(filename, indexname, rec2key=None)
1194
1195 Index a SwissProt file. filename is the name of the file.
1196 indexname is the name of the dictionary. rec2key is an
1197 optional callback that takes a Record and generates a unique key
1198 (e.g. the accession number) for the record. If not specified,
1199 the entry name will be used.
1200
1201 """
1202 from Bio.SwissProt import parse
1203 if not os.path.exists(filename):
1204 raise ValueError("%s does not exist" % filename)
1205
1206 index = Index.Index(indexname, truncate=1)
1207 index[Dictionary._Dictionary__filename_key] = filename
1208
1209 handle = open(filename)
1210 records = parse(handle)
1211 end = 0L
1212 for record in records:
1213 start = end
1214 end = long(handle.tell())
1215 length = end - start
1216
1217 if rec2key is not None:
1218 key = rec2key(record)
1219 else:
1220 key = record.entry_name
1221
1222 if not key:
1223 raise KeyError("empty sequence key was produced")
1224 elif key in index:
1225 raise KeyError("duplicate key %s found" % key)
1226
1227 index[key] = start, length
1228
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Wed Dec 16 11:27:39 2009 | http://epydoc.sourceforge.net |