| Trees | Indices | Help |
|---|
|
|
1 # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
5 #
6 # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla.
7 """Nexus class. Parse the contents of a NEXUS file.
8
9 Based upon 'NEXUS: An extensible file format for systematic information'
10 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
11 """
12
13 import os,sys, math, random, copy
14
15 from Bio.Alphabet import IUPAC
16 from Bio.Data import IUPACData
17 from Bio.Seq import Seq
18
19 from Trees import Tree,NodeData
20
21 try:
22 import cnexus
23 except ImportError:
24 C=False
25 else:
26 C=True
27
28 INTERLEAVE=70
29 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
30 'matrix','tree', 'utree','translate','codonposset','title']
31 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
32 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
33 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
34 WHITESPACE=' \t\n'
35 #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments
36 SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored
37 CHARSET='chars'
38 TAXSET='taxa'
39 CODONPOSITIONS='codonpositions'
40 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
42
44 """Helps reading NEXUS-words and characters from a buffer."""
50
56
63
69
71 while True:
72 p=self.next()
73 if p is None:
74 break
75 if p not in WHITESPACE:
76 return p
77 return None
78
82
84 for t in target:
85 try:
86 pos=self.buffer.index(t)
87 except ValueError:
88 pass
89 else:
90 found=''.join(self.buffer[:pos])
91 self.buffer=self.buffer[pos:]
92 return found
93 else:
94 return None
95
98
100 """Return the next NEXUS word from a string.
101
102 This deals with single and double quotes, whitespace and punctuation.
103 """
104
105 word=[]
106 quoted=False
107 first=self.next_nonwhitespace() # get first character
108 if not first: # return empty if only whitespace left
109 return None
110 word.append(first)
111 if first=="'": # word starts with a quote
112 quoted="'"
113 elif first=='"':
114 quoted='"'
115 elif first in PUNCTUATION: # if it's punctuation, return immediately
116 return first
117 while True:
118 c=self.peek()
119 if c==quoted: # a quote?
120 word.append(self.next()) # store quote
121 if self.peek()==quoted: # double quote
122 skip=self.next() # skip second quote
123 elif quoted: # second single quote ends word
124 break
125 elif quoted:
126 word.append(self.next()) # if quoted, then add anything
127 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop
128 break
129 else:
130 word.append(self.next()) # standard character
131 return ''.join(word)
132
136
138 """Calculate a stepmatrix for weighted parsimony.
139
140 See Wheeler (1990), Cladistics 6:269-275.
141 """
142
144 self.data={}
145 self.symbols=[s for s in symbols]
146 self.symbols.sort()
147 if gap:
148 self.symbols.append(gap)
149 for x in self.symbols:
150 for y in [s for s in self.symbols if s!=x]:
151 self.set(x,y,0)
152
157
162
165
167 total=self.sum()
168 if total!=0:
169 for k in self.data:
170 self.data[k]=self.data[k]/float(total)
171 return self
172
174 for k in self.data:
175 if self.data[k]!=0:
176 self.data[k]=-math.log(self.data[k])
177 return self
178
180 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
181 matrix+=' %s\n' % ' '.join(self.symbols)
182 for x in self.symbols:
183 matrix+='[%s]'.ljust(8) % x
184 for y in self.symbols:
185 if x==y:
186 matrix+=' . '
187 else:
188 if x>y:
189 x1,y1=y,x
190 else:
191 x1,y1=x,y
192 if self.data[x1+y1]==0:
193 matrix+='inf. '
194 else:
195 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
196 matrix+='\n'
197 matrix+=';\n'
198 return matrix
199
201 """Return a taxon identifier according to NEXUS standard.
202
203 Wrap quotes around names with punctuation or whitespace, and double
204 single quotes.
205
206 mrbayes=True: write names without quotes, whitespace or punctuation
207 for the mrbayes software package.
208 """
209 if mrbayes:
210 safe=name.replace(' ','_')
211 safe=''.join([c for c in safe if c in MRBAYESSAFE])
212 else:
213 safe=name.replace("'","''")
214 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)):
215 safe="'"+safe+"'"
216 return safe
217
219 """Remove quotes and/or double quotes around identifiers."""
220 if not word:
221 return None
222 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
223 word=word[1:-1]
224 return word
225
227 """Return position of first and last character which is not in skiplist.
228
229 Skiplist defaults to ['-','?'])."""
230
231 length=len(sequence)
232 if length==0:
233 return None,None
234 end=length-1
235 while end>=0 and (sequence[end] in skiplist):
236 end-=1
237 start=0
238 while start<length and (sequence[start] in skiplist):
239 start+=1
240 if start==length and end==-1: # empty sequence
241 return -1,-1
242 else:
243 return start,end
244
246 """Returns a sorted list of keys of p sorted by values of p."""
247 startpos=[(p[pn],pn) for pn in p if p[pn]]
248 startpos.sort()
249 # parenthisis added because of py3k
250 return (zip(*startpos))[1]
251
253 """Check that all values in list are unique and return a pruned and sorted list."""
254 l=list(set(l))
255 l.sort()
256 return l
257
259 """Returns a unique name if label is already in previous_labels."""
260 while label in previous_labels:
261 if label.split('.')[-1].startswith('copy'):
262 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
263 else:
264 label+='.copy'
265 return label
266
268 """Converts a Seq-object matrix to a plain sequence-string matrix."""
269 return dict([(t,matrix[t].tostring()) for t in matrix])
270
272 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
273 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
274 """
275
276 if not orig_list:
277 return ''
278 orig_list=list(set(orig_list))
279 orig_list.sort()
280 shortlist=[]
281 clist=orig_list[:]
282 clist.append(clist[-1]+.5) # dummy value makes it easier
283 while len(clist)>1:
284 step=1
285 for i,x in enumerate(clist):
286 if x==clist[0]+i*step: # are we still in the right step?
287 continue
288 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
289 # second element, and possibly at least 3 elements to link,
290 # and the next one is in the right step
291 step=x-clist[0]
292 else: # pattern broke, add all values before current position to new list
293 sub=clist[:i]
294 if len(sub)==1:
295 shortlist.append(str(sub[0]+1))
296 else:
297 if step==1:
298 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
299 else:
300 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
301 clist=clist[i:]
302 break
303 return ' '.join(shortlist)
304
306 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
307
308 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
309 Character sets, character partitions and taxon sets are prefixed, readjusted
310 and present in the combined matrix.
311 """
312
313 if not matrices:
314 return None
315 name=matrices[0][0]
316 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix
317 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1)
318 if mixed_datatypes:
319 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself!
320 # raise NexusError('Matrices must be of same datatype')
321 combined.charlabels=None
322 combined.statelabels=None
323 combined.interleave=False
324 combined.translate=None
325
326 # rename taxon sets and character sets and name them with prefix
327 for cn,cs in combined.charsets.iteritems():
328 combined.charsets['%s.%s' % (name,cn)]=cs
329 del combined.charsets[cn]
330 for tn,ts in combined.taxsets.iteritems():
331 combined.taxsets['%s.%s' % (name,tn)]=ts
332 del combined.taxsets[tn]
333 # previous partitions usually don't make much sense in combined matrix
334 # just initiate one new partition parted by single matrices
335 combined.charpartitions={'combined':{name:range(combined.nchar)}}
336 for n,m in matrices[1:]: # add all other matrices
337 both=[t for t in combined.taxlabels if t in m.taxlabels]
338 combined_only=[t for t in combined.taxlabels if t not in both]
339 m_only=[t for t in m.taxlabels if t not in both]
340 for t in both:
341 # concatenate sequences and unify gap and missing character symbols
342 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
343 # replace date of missing taxa with symbol for missing data
344 for t in combined_only:
345 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
346 for t in m_only:
347 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
348 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
349 combined.taxlabels.extend(m_only) # new taxon list
350 for cn,cs in m.charsets.iteritems(): # adjust character sets for new matrix
351 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
352 if m.taxsets:
353 if not combined.taxsets:
354 combined.taxsets={}
355 # update taxon sets
356 combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) \
357 for tn,ts in m.taxsets.iteritems()))
358 # update new charpartition
359 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
360 # update charlabels
361 if m.charlabels:
362 if not combined.charlabels:
363 combined.charlabels={}
364 combined.charlabels.update(dict((combined.nchar+i,label) \
365 for (i,label) in m.charlabels.iteritems()))
366 combined.nchar+=m.nchar # update nchar and ntax
367 combined.ntax+=len(m_only)
368
369 # some prefer partitions, some charsets:
370 # make separate charset for ecah initial dataset
371 for c in combined.charpartitions['combined']:
372 combined.charsets[c]=combined.charpartitions['combined'][c]
373
374 return combined
375
377 """Delete []-delimited comments out of a file and break into lines separated by ';'.
378
379 stripped_text=_kill_comments_and_break_lines(text):
380 Nested and multiline comments are allowed. [ and ] symbols within single
381 or double quotes are ignored, newline ends a quote, all symbols with quotes are
382 treated the same (thus not quoting inside comments like [this character ']' ends a comment])
383 Special [&...] and [\...] comments remain untouched, if not inside standard comment.
384 Quotes inside special [& and [\ are treated as normal characters,
385 but no nesting inside these special comments allowed (like [& [\ ]]).
386 ';' ist deleted from end of line.
387
388 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
389 """
390 import warnings
391 warnings.warn("This function is very slow for large files, and obsolete when using C extension cnexus", PendingDeprecationWarning)
392 contents=CharBuffer(text)
393 newtext=[]
394 newline=[]
395 quotelevel=''
396 speciallevel=False
397 commlevel=0
398 while True:
399 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character
400 #if not plain:
401 # newline.append(contents.rest) # not found, just add the rest
402 # break
403 #newline.append(plain) # add intermediate text
404 t=contents.next() # and get special character
405 if t is None:
406 break
407 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation
408 quotelevel=''
409 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation
410 quotelevel=t
411 elif not quotelevel and t=='[': # opening bracket outside a quote
412 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel:
413 speciallevel=True
414 else:
415 commlevel+=1
416 elif not quotelevel and t==']': # closing bracket ioutside a quote
417 if speciallevel:
418 speciallevel=False
419 else:
420 commlevel-=1
421 if commlevel<0:
422 raise NexusError('Nexus formatting error: unmatched ]')
423 continue
424 if commlevel==0: # copy if we're not in comment
425 if t==';' and not quotelevel:
426 newtext.append(''.join(newline))
427 newline=[]
428 else:
429 newline.append(t)
430 #level of comments should be 0 at the end of the file
431 if newline:
432 newtext.append('\n'.join(newline))
433 if commlevel>0:
434 raise NexusError('Nexus formatting error: unmatched [')
435 return newtext
436
437
439 """Adjust linebreaks to match ';', strip leading/trailing whitespace.
440
441 list_of_commandlines=_adjust_lines(input_text)
442 Lines are adjusted so that no linebreaks occur within a commandline
443 (except matrix command line)
444 """
445 formatted_lines=[]
446 for l in lines:
447 #Convert line endings
448 l=l.replace('\r\n','\n').replace('\r','\n').strip()
449 if l.lower().startswith('matrix'):
450 formatted_lines.append(l)
451 else:
452 l=l.replace('\n',' ')
453 if l:
454 formatted_lines.append(l)
455 return formatted_lines
456
458 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
459
460 opening=seq.find('(')
461 while opening>-1:
462 closing=seq.find(')')
463 if closing<0:
464 raise NexusError('Missing closing parenthesis in: '+seq)
465 elif closing<opening:
466 raise NexusError('Missing opening parenthesis in: '+seq)
467 ambig=[x for x in seq[opening+1:closing]]
468 ambig.sort()
469 ambig=''.join(ambig)
470 ambig_code=rev_ambig_values[ambig.upper()]
471 if ambig!=ambig.upper():
472 ambig_code=ambig_code.lower()
473 seq=seq[:opening]+ambig_code+seq[closing+1:]
474 opening=seq.find('(')
475 return seq
476
478 """Represent a commandline as command and options."""
479
481 self.options={}
482 options=[]
483 self.command=None
484 try:
485 #Assume matrix (all other command lines have been stripped of \n)
486 self.command, options = line.strip().split('\n', 1)
487 except ValueError: #Not matrix
488 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...)
489 self.command=line.split()[0]
490 options=' '.join(line.split()[1:])
491 self.command = self.command.strip().lower()
492 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved
493 self.options=options.strip()
494 else:
495 if len(options) > 0:
496 try:
497 options = options.replace('=', ' = ').split()
498 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
499 indices = []
500 for sl in valued_indices:
501 indices.extend(sl)
502 token_indices = [n for n in range(len(options)) if n not in indices]
503 for opt in valued_indices:
504 #self.options[options[opt[0]].lower()] = options[opt[2]].lower()
505 self.options[options[opt[0]].lower()] = options[opt[2]]
506 for token in token_indices:
507 self.options[options[token].lower()] = None
508 except ValueError:
509 raise NexusError('Incorrect formatting in line: %s' % line)
510
516
518
519 __slots__=['original_taxon_order','__dict__']
520
522 self.ntax=0 # number of taxa
523 self.nchar=0 # number of characters
524 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.)
525 self.taxlabels=[] # labels for taxa, ordered by their id
526 self.charlabels=None # ... and for characters
527 self.statelabels=None # ... and for states
528 self.datatype='dna' # (standard), dna, rna, nucleotide, protein
529 self.respectcase=False # case sensitivity
530 self.missing='?' # symbol for missing characters
531 self.gap='-' # symbol for gap
532 self.symbols=None # set of symbols
533 self.equate=None # set of symbol synonyms
534 self.matchchar=None # matching char for matrix representation
535 self.labels=None # left, right, no
536 self.transpose=False # whether matrix is transposed
537 self.interleave=False # whether matrix is interleaved
538 self.tokens=False # unsupported
539 self.eliminate=None # unsupported
540 self.matrix=None # ...
541 self.unknown_blocks=[] # blocks we don't care about
542 self.taxsets={}
543 self.charsets={}
544 self.charpartitions={}
545 self.taxpartitions={}
546 self.trees=[] # list of Trees (instances of Tree class)
547 self.translate=None # Dict to translate taxon <-> taxon numbers
548 self.structured=[] # structured input representation
549 self.set={} # dict of the set command to set various options
550 self.options={} # dict of the options command in the data block
551 self.codonposset=None # name of the charpartition that defines codon positions
552
553 # some defaults
554 self.options['gapmode']='missing'
555
556 if input:
557 self.read(input)
558 else:
559 self.read(DEFAULTNEXUS)
560
567 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
568
570 """Read and parse NEXUS imput (a filename, file-handle, or string)."""
571
572 # 1. Assume we have the name of a file in the execution dir
573 # Note we need to add parsing of the path to dir/filename
574 try:
575 file_contents = open(os.path.expanduser(input),'rU').read()
576 self.filename=input
577 except (TypeError,IOError,AttributeError):
578 #2 Assume we have a string from a fh.read()
579 if isinstance(input, str):
580 file_contents = input
581 self.filename='input_string'
582 #3 Assume we have a file object
583 elif hasattr(input,'read'): # file objects or StringIO objects
584 file_contents=input.read()
585 if hasattr(input,"name") and input.name:
586 self.filename=input.name
587 else:
588 self.filename='Unknown_nexus_file'
589 else:
590 print input.strip()[:50]
591 raise NexusError('Unrecognized input: %s ...' % input[:100])
592 file_contents=file_contents.strip()
593 if file_contents.startswith('#NEXUS'):
594 file_contents=file_contents[6:]
595 if C:
596 decommented=cnexus.scanfile(file_contents)
597 #check for unmatched parentheses
598 if decommented=='[' or decommented==']':
599 raise NexusError('Unmatched %s' % decommented)
600 # cnexus can't return lists, so in analogy we separate commandlines with chr(7)
601 # (a character that shoudn't be part of a nexus file under normal circumstances)
602 commandlines=_adjust_lines(decommented.split(chr(7)))
603 else:
604 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
605 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times'
606 for i,cl in enumerate(commandlines):
607 try:
608 if cl[:6].upper()=='#NEXUS':
609 commandlines[i]=cl[6:].strip()
610 except:
611 pass
612 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands
613 nexus_block_gen = self._get_nexus_block(commandlines)
614 while 1:
615 try:
616 title, contents = nexus_block_gen.next()
617 except StopIteration:
618 break
619 if title in KNOWN_NEXUS_BLOCKS:
620 self._parse_nexus_block(title, contents)
621 else:
622 self._unknown_nexus_block(title, contents)
623
625 """Generator for looping through Nexus blocks."""
626 inblock=False
627 blocklines=[]
628 while file_contents:
629 cl=file_contents.pop(0)
630 if cl.lower().startswith('begin'):
631 if not inblock:
632 inblock=True
633 title=cl.split()[1].lower()
634 else:
635 raise NexusError('Illegal block nesting in block %s' % title)
636 elif cl.lower().startswith('end'):
637 if inblock:
638 inblock=False
639 yield title,blocklines
640 blocklines=[]
641 else:
642 raise NexusError('Unmatched \'end\'.')
643 elif inblock:
644 blocklines.append(cl)
645
647 block = Block()
648 block.commandlines.append(contents)
649 block.title = title
650 self.unknown_blocks.append(block)
651
653 """Parse a known Nexus Block (PRIVATE)."""
654 # attached the structered block representation
655 self._apply_block_structure(title, contents)
656 #now check for taxa,characters,data blocks. If this stuff is defined more than once
657 #the later occurences will override the previous ones.
658 block=self.structured[-1]
659 for line in block.commandlines:
660 try:
661 getattr(self,'_'+line.command)(line.options)
662 except AttributeError:
663 raise
664 raise NexusError('Unknown command: %s ' % line.command)
665
668
670 if 'ntax' in options:
671 self.ntax=eval(options['ntax'])
672 if 'nchar' in options:
673 self.nchar=eval(options['nchar'])
674
676 # print options
677 # we first need to test respectcase, then symbols (which depends on respectcase)
678 # then datatype (which, if standard, depends on symbols and respectcase in order to generate
679 # dicts for ambiguous values and alphabet
680 if 'respectcase' in options:
681 self.respectcase=True
682 # adjust symbols to for respectcase
683 if 'symbols' in options:
684 self.symbols=options['symbols']
685 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\
686 (self.symbold.startswith("'") and self.symbols.endswith("'")):
687 self.symbols=self.symbols[1:-1].replace(' ','')
688 if not self.respectcase:
689 self.symbols=self.symbols.lower()+self.symbols.upper()
690 self.symbols=list(set(self.symbols))
691 if 'datatype' in options:
692 self.datatype=options['datatype'].lower()
693 if self.datatype=='dna' or self.datatype=='nucleotide':
694 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna)
695 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values)
696 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters)
697 elif self.datatype=='rna':
698 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna)
699 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values)
700 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters)
701 elif self.datatype=='protein':
702 self.alphabet=copy.deepcopy(IUPAC.protein)
703 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it
704 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon
705 elif self.datatype=='standard':
706 raise NexusError('Datatype standard is not yet supported.')
707 #self.alphabet=None
708 #self.ambiguous_values={}
709 #if not self.symbols:
710 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states
711 #self.unambiguous_letters=self.symbols
712 else:
713 raise NexusError('Unsupported datatype: '+self.datatype)
714 self.valid_characters=''.join(self.ambiguous_values)+self.unambiguous_letters
715 if not self.respectcase:
716 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper()
717 #we have to sort the reverse ambig coding dict key characters:
718 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
719 rev=dict((i[1],i[0]) for i in self.ambiguous_values.iteritems() if i[0]!='X')
720 self.rev_ambiguous_values={}
721 for (k,v) in rev.iteritems():
722 key=[c for c in k]
723 key.sort()
724 self.rev_ambiguous_values[''.join(key)]=v
725 #overwrite symbols for datype rna,dna,nucleotide
726 if self.datatype in ['dna','rna','nucleotide']:
727 self.symbols=self.alphabet.letters
728 if self.missing not in self.ambiguous_values:
729 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap
730 self.ambiguous_values[self.gap]=self.gap
731 elif self.datatype=='standard':
732 if not self.symbols:
733 self.symbols=['1','0']
734 if 'missing' in options:
735 self.missing=options['missing'][0]
736 if 'gap' in options:
737 self.gap=options['gap'][0]
738 if 'equate' in options:
739 self.equate=options['equate']
740 if 'matchchar' in options:
741 self.matchchar=options['matchchar'][0]
742 if 'labels' in options:
743 self.labels=options['labels']
744 if 'transpose' in options:
745 raise NexusError('TRANSPOSE is not supported!')
746 self.transpose=True
747 if 'interleave' in options:
748 if options['interleave']==None or options['interleave'].lower()=='yes':
749 self.interleave=True
750 if 'tokens' in options:
751 self.tokens=True
752 if 'notokens' in options:
753 self.tokens=False
754
755
757 self.set=options;
758
761
764
766 """Get taxon labels (PRIVATE).
767
768 As the taxon names are already in the matrix, this is superfluous
769 except for transpose matrices, which are currently unsupported anyway.
770 Thus, we ignore the taxlabels command to make handling of duplicate
771 taxon names easier.
772 """
773 pass
774 #self.taxlabels=[]
775 #opts=CharBuffer(options)
776 #while True:
777 # taxon=quotestrip(opts.next_word())
778 # if not taxon:
779 # break
780 # self.taxlabels.append(taxon)
781
783 """Check for presence of taxon in self.taxlabels."""
784 # According to NEXUS standard, underscores shall be treated as spaces...,
785 # so checking for identity is more difficult
786 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
787 nexid=taxon.replace(' ','_')
788 return nextaxa.get(nexid)
789
791 self.charlabels={}
792 opts=CharBuffer(options)
793 while True:
794 try:
795 # get id and state
796 w=opts.next_word()
797 if w is None: # McClade saves and reads charlabel-lists with terminal comma?!
798 break
799 identifier=self._resolve(w,set_type=CHARSET)
800 state=quotestrip(opts.next_word())
801 self.charlabels[identifier]=state
802 # check for comma or end of command
803 c=opts.next_nonwhitespace()
804 if c is None:
805 break
806 elif c!=',':
807 raise NexusError('Missing \',\' in line %s.' % options)
808 except NexusError:
809 raise
810 except:
811 raise NexusError('Format error in line %s.' % options)
812
816
818 #self.charlabels=options
819 #print 'Command statelabels is not supported and will be ignored.'
820 pass
821
823 if not self.ntax or not self.nchar:
824 raise NexusError('Dimensions must be specified before matrix!')
825 self.matrix={}
826 taxcount=0
827 first_matrix_block=True
828
829 #eliminate empty lines and leading/trailing whitespace
830 lines=[l.strip() for l in options.split('\n') if l.strip()!='']
831 lineiter=iter(lines)
832 while 1:
833 try:
834 l=lineiter.next()
835 except StopIteration:
836 if taxcount<self.ntax:
837 raise NexusError('Not enough taxa in matrix.')
838 elif taxcount>self.ntax:
839 raise NexusError('Too many taxa in matrix.')
840 else:
841 break
842 # count the taxa and check for interleaved matrix
843 taxcount+=1
844 ##print taxcount
845 if taxcount>self.ntax:
846 if not self.interleave:
847 raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
848 else:
849 taxcount=1
850 first_matrix_block=False
851 #get taxon name and sequence
852 linechars=CharBuffer(l)
853 id=quotestrip(linechars.next_word())
854 l=linechars.rest().strip()
855 chars=''
856 if self.interleave:
857 #interleaved matrix
858 #print 'In interleave'
859 if l:
860 chars=''.join(l.split())
861 else:
862 chars=''.join(lineiter.next().split())
863 else:
864 #non-interleaved matrix
865 chars=''.join(l.split())
866 while len(chars)<self.nchar:
867 l=lineiter.next()
868 chars+=''.join(l.split())
869 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
870 #first taxon has the reference sequence if matchhar is used
871 if taxcount==1:
872 refseq=iupac_seq
873 else:
874 if self.matchchar:
875 while 1:
876 p=iupac_seq.tostring().find(self.matchchar)
877 if p==-1:
878 break
879 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
880 #check for invalid characters
881 for i,c in enumerate(iupac_seq.tostring()):
882 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
883 raise NexusError( \
884 ('Taxon %s: Illegal character %s in sequence %s ' + \
885 '(check dimensions/interleaving)') % (id,c, iupac_seq))
886 #add sequence to matrix
887 if first_matrix_block:
888 self.unaltered_taxlabels.append(id)
889 id=_unique_label(self.matrix.keys(),id)
890 self.matrix[id]=iupac_seq
891 self.taxlabels.append(id)
892 else:
893 # taxon names need to be in the same order in each interleaved block
894 id=_unique_label(self.taxlabels[:taxcount-1],id)
895 taxon_present=self._check_taxlabels(id)
896 if taxon_present:
897 self.matrix[taxon_present]+=iupac_seq
898 else:
899 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id)
900 #check all sequences for length according to nchar
901 for taxon in self.matrix:
902 if len(self.matrix[taxon])!=self.nchar:
903 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
904 % (self.nchar, len(self.matrix[taxon]),taxon))
905 #check that taxlabels is identical with matrix.keys. If not, it's a problem
906 matrixkeys=sorted(self.matrix)
907 taxlabelssort=self.taxlabels[:]
908 taxlabelssort.sort()
909 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
910
912 self.translate={}
913 opts=CharBuffer(options)
914 while True:
915 try:
916 # get id and state
917 identifier=int(opts.next_word())
918 label=quotestrip(opts.next_word())
919 self.translate[identifier]=label
920 # check for comma or end of command
921 c=opts.next_nonwhitespace()
922 if c is None:
923 break
924 elif c!=',':
925 raise NexusError('Missing \',\' in line %s.' % options)
926 except NexusError:
927 raise
928 except:
929 raise NexusError('Format error in line %s.' % options)
930
934
936 opts=CharBuffer(options)
937 if opts.peek_nonwhitespace()=='*': # a star can be used to make it the default tree in some software packages
938 dummy=opts.next_nonwhitespace()
939 name=opts.next_word()
940 if opts.next_nonwhitespace()!='=':
941 raise NexusError('Syntax error in tree description: %s' \
942 % options[:50])
943 rooted=False
944 weight=1.0
945 while opts.peek_nonwhitespace()=='[':
946 open=opts.next_nonwhitespace()
947 symbol=opts.next()
948 if symbol!='&':
949 raise NexusError('Illegal special comment [%s...] in tree description: %s' \
950 % (symbol, options[:50]))
951 special=opts.next()
952 value=opts.next_until(']')
953 closing=opts.next()
954 if special=='R':
955 rooted=True
956 elif special=='U':
957 rooted=False
958 elif special=='W':
959 weight=float(value)
960 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip())
961 # if there's an active translation table, translate
962 if self.translate:
963 for n in tree.get_terminals():
964 try:
965 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)])
966 except (ValueError,KeyError):
967 raise NexusError('Unable to substitue %s using \'translate\' data.' \
968 % tree.node(n).data.taxon)
969 self.trees.append(tree)
970
972 block=Block('')
973 block.title = title
974 for line in lines:
975 block.commandlines.append(Commandline(line, title))
976 self.structured.append(block)
977
981
983 name,sites=self._get_indices(options,set_type=CHARSET)
984 self.charsets[name]=_make_unique(sites)
985
987 taxpartition={}
988 quotelevel=False
989 opts=CharBuffer(options)
990 name=self._name_n_vector(opts)
991 if not name:
992 raise NexusError('Formatting error in taxpartition: %s ' % options)
993 # now collect thesubbpartitions and parse them
994 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
995 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words
996 sub=''
997 while True:
998 w=opts.next()
999 if w is None or (w==',' and not quotelevel):
1000 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
1001 taxpartition[subname]=_make_unique(subindices)
1002 sub=''
1003 if w is None:
1004 break
1005 else:
1006 if w=="'":
1007 quotelevel=not quotelevel
1008 sub+=w
1009 self.taxpartitions[name]=taxpartition
1010
1012 """Read codon positions from a codons block as written from McClade.
1013
1014 Here codonposset is just a fancy name for a character partition with
1015 the name CodonPositions and the partitions N,1,2,3
1016 """
1017
1018 prev_partitions=self.charpartitions
1019 self._charpartition(options)
1020 # mcclade calls it CodonPositions, but you never know...
1021 codonname=[n for n in self.charpartitions if n not in prev_partitions]
1022 if codonname==[] or len(codonname)>1:
1023 raise NexusError('Formatting Error in codonposset: %s ' % options)
1024 else:
1025 self.codonposset=codonname[0]
1026
1029
1031 charpartition={}
1032 quotelevel=False
1033 opts=CharBuffer(options)
1034 name=self._name_n_vector(opts)
1035 if not name:
1036 raise NexusError('Formatting error in charpartition: %s ' % options)
1037 # now collect thesubbpartitions and parse them
1038 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
1039 sub=''
1040 while True:
1041 w=opts.next()
1042 if w is None or (w==',' and not quotelevel):
1043 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1044 charpartition[subname]=_make_unique(subindices)
1045 sub=''
1046 if w is None:
1047 break
1048 else:
1049 if w=="'":
1050 quotelevel=not quotelevel
1051 sub+=w
1052 self.charpartitions[name]=charpartition
1053
1055 """Parse the taxset/charset specification (PRIVATE).
1056
1057 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3'
1058 --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1059 """
1060 opts=CharBuffer(options)
1061 name=self._name_n_vector(opts,separator=separator)
1062 indices=self._parse_list(opts,set_type=set_type)
1063 if indices is None:
1064 raise NexusError('Formatting error in line: %s ' % options)
1065 return name,indices
1066
1068 """Extract name and check that it's not in vector format."""
1069 rest=opts.rest()
1070 name=opts.next_word()
1071 # we ignore * before names
1072 if name=='*':
1073 name=opts.next_word()
1074 if not name:
1075 raise NexusError('Formatting error in line: %s ' % rest)
1076 name=quotestrip(name)
1077 if opts.peek_nonwhitespace=='(':
1078 open=opts.next_nonwhitespace()
1079 qualifier=open.next_word()
1080 close=opts.next_nonwhitespace()
1081 if qualifier.lower()=='vector':
1082 raise NexusError('Unsupported VECTOR format in line %s' \
1083 % (opts))
1084 elif qualifier.lower()!='standard':
1085 raise NexusError('Unknown qualifier %s in line %s' \
1086 % (qualifier, opts))
1087 if opts.next_nonwhitespace()!=separator:
1088 raise NexusError('Formatting error in line: %s ' % rest)
1089 return name
1090
1092 """Parse a NEXUS list (PRIVATE).
1093
1094 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1095 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1096 """
1097 plain_list=[]
1098 if options_buffer.peek_nonwhitespace():
1099 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError
1100 while True:
1101 identifier=options_buffer.next_word() # next list element
1102 if not identifier: # end of list?
1103 break
1104 start=self._resolve(identifier,set_type=set_type)
1105 if options_buffer.peek_nonwhitespace()=='-': # followd by -
1106 end=start
1107 step=1
1108 # get hyphen and end of range
1109 hyphen=options_buffer.next_nonwhitespace()
1110 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1111 if set_type==CHARSET:
1112 if options_buffer.peek_nonwhitespace()=='\\': # followd by \
1113 backslash=options_buffer.next_nonwhitespace()
1114 step=int(options_buffer.next_word()) # get backslash and step
1115 plain_list.extend(range(start,end+1,step))
1116 else:
1117 if type(start)==list or type(end)==list:
1118 raise NexusError('Name if character sets not allowed in range definition: %s'\
1119 % identifier)
1120 start=self.taxlabels.index(start)
1121 end=self.taxlabels.index(end)
1122 taxrange=self.taxlabels[start:end+1]
1123 plain_list.extend(taxrange)
1124 else:
1125 if type(start)==list: # start was the name of charset or taxset
1126 plain_list.extend(start)
1127 else: # start was an ordinary identifier
1128 plain_list.append(start)
1129 except NexusError:
1130 raise
1131 except:
1132 return None
1133 return plain_list
1134
1136 """Translate identifier in list into character/taxon index.
1137
1138 Characters (which are referred to by their index in Nexus.py):
1139 Plain numbers are returned minus 1 (Nexus indices to python indices)
1140 Text identifiers are translaterd into their indices (if plain character indentifiers),
1141 the first hit in charlabels is returned (charlabels don't need to be unique)
1142 or the range of indices is returned (if names of character sets).
1143 Taxa (which are referred to by their unique name in Nexus.py):
1144 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1145 Names are returned unchanged (if plain taxon identifiers), or the names in
1146 the corresponding taxon set is returned
1147 """
1148 identifier=quotestrip(identifier)
1149 if not set_type:
1150 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1151 if set_type==CHARSET:
1152 try:
1153 n=int(identifier)
1154 except ValueError:
1155 if self.charlabels and identifier in self.charlabels.itervalues():
1156 for k in self.charlabels:
1157 if self.charlabels[k]==identifier:
1158 return k
1159 elif self.charsets and identifier in self.charsets:
1160 return self.charsets[identifier]
1161 else:
1162 raise NexusError('Unknown character identifier: %s' \
1163 % identifier)
1164 else:
1165 if n<=self.nchar:
1166 return n-1
1167 else:
1168 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \
1169 % (identifier,self.nchar))
1170 elif set_type==TAXSET:
1171 try:
1172 n=int(identifier)
1173 except ValueError:
1174 taxlabels_id=self._check_taxlabels(identifier)
1175 if taxlabels_id:
1176 return taxlabels_id
1177 elif self.taxsets and identifier in self.taxsets:
1178 return self.taxsets[identifier]
1179 else:
1180 raise NexusError('Unknown taxon identifier: %s' \
1181 % identifier)
1182 else:
1183 if n>0 and n<=self.ntax:
1184 return self.taxlabels[n-1]
1185 else:
1186 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \
1187 % (identifier,self.ntax))
1188 else:
1189 raise NexusError('Unknown set specification: %s.'% set_type)
1190
1194
1198
1202
1206
1207 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
1208 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1209 """Writes a nexus file for each partition in charpartition.
1210
1211 Only non-excluded characters and non-deleted taxa are included,
1212 just the data block is written.
1213 """
1214
1215 if not matrix:
1216 matrix=self.matrix
1217 if not matrix:
1218 return
1219 if not filename:
1220 filename=self.filename
1221 if charpartition:
1222 pfilenames={}
1223 for p in charpartition:
1224 total_exclude=[]+exclude
1225 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1226 total_exclude=_make_unique(total_exclude)
1227 pcomment=comment+'\nPartition: '+p+'\n'
1228 dot=filename.rfind('.')
1229 if dot>0:
1230 pfilename=filename[:dot]+'_'+p+'.data'
1231 else:
1232 pfilename=filename+'_'+p
1233 pfilenames[p]=pfilename
1234 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1235 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1236 mrbayes=mrbayes)
1237 return pfilenames
1238 else:
1239 fn=self.filename+'.data'
1240 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1241 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1242 mrbayes=mrbayes)
1243 return fn
1244
1245 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1246 blocksize=None, interleave=False, interleave_by_partition=False,\
1247 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
1248 codons_block=True):
1249 """Writes a nexus file with data and sets block to a file or handle.
1250
1251 Character sets and partitions are appended by default, and are
1252 adjusted according to excluded characters (i.e. character sets
1253 still point to the same sites (not necessarily same positions),
1254 without including the deleted characters.
1255
1256 filename - Either a filename as a string (which will be opened,
1257 written to and closed), or a handle object (which will
1258 be written to but NOT closed).
1259 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the
1260 start of the file is ommited.
1261
1262 Returns the filename/handle used to write the data.
1263 """
1264 if not matrix:
1265 matrix=self.matrix
1266 if not matrix:
1267 return
1268 if not filename:
1269 filename=self.filename
1270 if [t for t in delete if not self._check_taxlabels(t)]:
1271 raise NexusError('Unknown taxa: %s' \
1272 % ', '.join(set(delete).difference(set(self.taxlabels))))
1273 if interleave_by_partition:
1274 if not interleave_by_partition in self.charpartitions:
1275 raise NexusError('Unknown partition: '+interleave_by_partition)
1276 else:
1277 partition=self.charpartitions[interleave_by_partition]
1278 # we need to sort the partition names by starting position before we exclude characters
1279 names=_sort_keys_by_values(partition)
1280 newpartition={}
1281 for p in partition:
1282 newpartition[p]=[c for c in partition[p] if c not in exclude]
1283 # how many taxa and how many characters are left?
1284 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1285 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1286 ntax_adjusted=len(undelete)
1287 nchar_adjusted=len(cropped_matrix[undelete[0]])
1288 if not undelete or (undelete and undelete[0]==''):
1289 return
1290 if isinstance(filename,basestring):
1291 try:
1292 fh=open(filename,'w',encoding="utf-8")
1293 except IOError:
1294 raise NexusError('Could not open %s for writing.' % filename)
1295 elif hasattr(filename, 'write'):
1296 #e.g. StringIO or a real file handle
1297 fh=filename
1298 else:
1299 raise ValueError("Neither a filename nor a handle was supplied")
1300 if not omit_NEXUS:
1301 fh.write('#NEXUS\n')
1302 if comment:
1303 fh.write('['+comment+']\n')
1304 fh.write('begin data;\n')
1305 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1306 fh.write('\tformat datatype='+self.datatype)
1307 if self.respectcase:
1308 fh.write(' respectcase')
1309 if self.missing:
1310 fh.write(' missing='+self.missing)
1311 if self.gap:
1312 fh.write(' gap='+self.gap)
1313 if self.matchchar:
1314 fh.write(' matchchar='+self.matchchar)
1315 if self.labels:
1316 fh.write(' labels='+self.labels)
1317 if self.equate:
1318 fh.write(' equate='+self.equate)
1319 if interleave or interleave_by_partition:
1320 fh.write(' interleave')
1321 fh.write(';\n')
1322 #if self.taxlabels:
1323 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
1324 if self.charlabels:
1325 newcharlabels=self._adjust_charlabels(exclude=exclude)
1326 clkeys=sorted(newcharlabels)
1327 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1328 fh.write('matrix\n')
1329 if not blocksize:
1330 if interleave:
1331 blocksize=70
1332 else:
1333 blocksize=self.nchar
1334 # delete deleted taxa and ecxclude excluded characters...
1335 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1336 if interleave_by_partition:
1337 # interleave by partitions, but adjust partitions with regard to excluded characters
1338 seek=0
1339 for p in names:
1340 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1341 if len(newpartition[p])>0:
1342 for taxon in undelete:
1343 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1344 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1345 fh.write('\n')
1346 else:
1347 fh.write('[empty]\n\n')
1348 seek+=len(newpartition[p])
1349 elif interleave:
1350 for seek in range(0,nchar_adjusted,blocksize):
1351 for taxon in undelete:
1352 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1353 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1354 fh.write('\n')
1355 else:
1356 for taxon in undelete:
1357 if blocksize<nchar_adjusted:
1358 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1359 else:
1360 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1361 for seek in range(0,nchar_adjusted,blocksize):
1362 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1363 fh.write(';\nend;\n')
1364 if append_sets:
1365 if codons_block:
1366 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
1367 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
1368 else:
1369 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1370
1371 if fh == filename:
1372 #We were given the handle, don't close it.
1373 return filename
1374 else:
1375 #We opened the handle, so we should close it.
1376 fh.close()
1377 return filename
1378
1379 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1380 """Returns a sets block."""
1381 if not self.charsets and not self.taxsets and not self.charpartitions:
1382 return ''
1383 if codons_only:
1384 setsb=['\nbegin codons']
1385 else:
1386 setsb=['\nbegin sets']
1387 # - now if characters have been excluded, the character sets need to be adjusted,
1388 # so that they still point to the right character positions
1389 # calculate a list of offsets: for each deleted character, the following character position
1390 # in the new file will have an additional offset of -1
1391 offset=0
1392 offlist=[]
1393 for c in range(self.nchar):
1394 if c in exclude:
1395 offset+=1
1396 offlist.append(-1) # dummy value as these character positions are excluded
1397 else:
1398 offlist.append(c-offset)
1399 # now adjust each of the character sets
1400 if not codons_only:
1401 for n,ns in self.charsets.iteritems():
1402 cset=[offlist[c] for c in ns if c not in exclude]
1403 if cset:
1404 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1405 for n,s in self.taxsets.iteritems():
1406 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1407 if tset:
1408 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1409 for n,p in self.charpartitions.iteritems():
1410 if not include_codons and n==CODONPOSITIONS:
1411 continue
1412 elif codons_only and n!=CODONPOSITIONS:
1413 continue
1414 # as characters have been excluded, the partitions must be adjusted
1415 # if a partition is empty, it will be omitted from the charpartition command
1416 # (although paup allows charpartition part=t1:,t2:,t3:1-100)
1417 names=_sort_keys_by_values(p)
1418 newpartition={}
1419 for sn in names:
1420 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1421 if nsp:
1422 newpartition[sn]=nsp
1423 if newpartition:
1424 if include_codons and n==CODONPOSITIONS:
1425 command='codonposset'
1426 else:
1427 command='charpartition'
1428 setsb.append('%s %s = %s' % (command,safename(n),\
1429 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1430 # now write charpartititions, much easier than charpartitions
1431 for n,p in self.taxpartitions.iteritems():
1432 names=_sort_keys_by_values(p)
1433 newpartition={}
1434 for sn in names:
1435 nsp=[t for t in p[sn] if t not in delete]
1436 if nsp:
1437 newpartition[sn]=nsp
1438 if newpartition:
1439 setsb.append('taxpartition %s = %s' % (safename(n),\
1440 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1441 # add 'end' and return everything
1442 setsb.append('end;\n')
1443 if len(setsb)==2: # begin and end only
1444 return ''
1445 else:
1446 return ';\n'.join(setsb)
1447
1449 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1450 if not filename:
1451 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1452 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1453 else:
1454 filename=self.filename+'.fas'
1455 fh=open(filename,'w')
1456 for taxon in self.taxlabels:
1457 fh.write('>'+safename(taxon)+'\n')
1458 for i in range(0, len(self.matrix[taxon].tostring()), width):
1459 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1460 fh.close()
1461 return filename
1462
1464 """Writes matrix into a PHYLIP file: (self, filename=None).
1465
1466 Note that this writes a relaxed PHYLIP format file, where the names
1467 are not truncated, nor checked for invalid characters."""
1468 if not filename:
1469 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1470 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
1471 else:
1472 filename=self.filename+'.phy'
1473 fh=open(filename,'w')
1474 fh.write('%d %d\n' % (self.ntax,self.nchar))
1475 for taxon in self.taxlabels:
1476 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
1477 fh.close()
1478 return filename
1479
1481 """Return a list with all constant characters."""
1482 if not matrix:
1483 matrix=self.matrix
1484 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1485 if not undelete:
1486 return None
1487 elif len(undelete)==1:
1488 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1489 # get the first sequence and expand all ambiguous values
1490 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1491 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1492 for taxon in undelete[1:]:
1493 newconstant=[]
1494 for site in constant:
1495 #print '%d (paup=%d)' % (site[0],site[0]+1),
1496 seqsite=matrix[taxon][site[0]].upper()
1497 #print seqsite,'checked against',site[1],'\t',
1498 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1499 # missing or same as before -> ok
1500 newconstant.append(site)
1501 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1502 # subset of an ambig or only missing in previous -> take subset
1503 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1504 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values
1505 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1]))
1506 if intersect:
1507 newconstant.append((site[0],''.join(intersect)))
1508 # print 'ok'
1509 #else:
1510 # print 'failed'
1511 #else:
1512 # print 'failed'
1513 constant=newconstant
1514 cpos=[s[0] for s in constant]
1515 return cpos
1516
1518 """Summarize character.
1519
1520 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1521 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1522 """
1523 undelete=[t for t in self.taxlabels if t not in delete]
1524 if not undelete:
1525 return None
1526 cstatus=[]
1527 for t in undelete:
1528 c=self.matrix[t][site].upper()
1529 if self.options.get('gapmode')=='missing' and c==self.gap:
1530 c=self.missing
1531 if narrow and c==self.missing:
1532 if c not in cstatus:
1533 cstatus.append(c)
1534 else:
1535 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1536 if self.missing in cstatus and narrow and len(cstatus)>1:
1537 cstatus=[c for c in cstatus if c!=self.missing]
1538 cstatus.sort()
1539 return cstatus
1540
1542 """Calculates a stepmatrix for weighted parsimony.
1543
1544 See Wheeler (1990), Cladistics 6:269-275 and
1545 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
1546 """
1547 m=StepMatrix(self.unambiguous_letters,self.gap)
1548 for site in [s for s in range(self.nchar) if s not in exclude]:
1549 cstatus=self.cstatus(site,delete)
1550 for i,b1 in enumerate(cstatus[:-1]):
1551 for b2 in cstatus[i+1:]:
1552 m.add(b1.upper(),b2.upper(),1)
1553 return m.transformation().weighting().smprint(name=name)
1554
1556 """Return a matrix without deleted taxa and excluded characters."""
1557 if not matrix:
1558 matrix=self.matrix
1559 if [t for t in delete if not self._check_taxlabels(t)]:
1560 raise NexusError('Unknown taxa: %s' \
1561 % ', '.join(set(delete).difference(self.taxlabels)))
1562 if exclude!=[]:
1563 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1564 if not undelete:
1565 return {}
1566 m=[matrix[k].tostring() for k in undelete]
1567 zipped_m=zip(*m)
1568 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1569 if sitesm==[]:
1570 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1571 else:
1572 zipped_sitesm=zip(*sitesm)
1573 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1574 return dict(zip(undelete,m))
1575 else:
1576 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1577
1579 """Return a bootstrapped matrix."""
1580 if not matrix:
1581 matrix=self.matrix
1582 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects
1583 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out
1584 if not cm: # everything deleted?
1585 return {}
1586 elif len(cm[cm.keys()[0]])==0: # everything excluded?
1587 return cm
1588 undelete=[t for t in self.taxlabels if t in cm]
1589 if seqobjects:
1590 sitesm=zip(*[cm[t].tostring() for t in undelete])
1591 alphabet=matrix[matrix.keys()[0]].alphabet
1592 else:
1593 sitesm=zip(*[cm[t] for t in undelete])
1594 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1595 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1596 if seqobjects:
1597 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1598 return dict(zip(undelete,bootstrapseqs))
1599
1601 """Adds a sequence (string) to the matrix."""
1602
1603 if not name:
1604 raise NexusError('New sequence must have a name')
1605
1606 diff=self.nchar-len(sequence)
1607 if diff<0:
1608 self.insert_gap(self.nchar,-diff)
1609 elif diff>0:
1610 sequence+=self.missing*diff
1611
1612 if name in self.taxlabels:
1613 unique_name=_unique_label(self.taxlabels,name)
1614 #print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name)
1615 else:
1616 unique_name=name
1617
1618 assert unique_name not in self.matrix, "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
1619
1620 self.matrix[unique_name]=Seq(sequence,self.alphabet)
1621 self.ntax+=1
1622 self.taxlabels.append(unique_name)
1623 self.unaltered_taxlabels.append(name)
1624
1626 """Add a gap into the matrix and adjust charsets and partitions.
1627
1628 pos=0: first position
1629 pos=nchar: last position
1630 """
1631
1632 def _adjust(set,x,d,leftgreedy=False):
1633 """Adjusts chartacter sets if gaps are inserted, taking care of
1634 new gaps within a coherent character set."""
1635 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15
1636 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18
1637 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
1638 set.sort()
1639 addpos=0
1640 for i,c in enumerate(set):
1641 if c>=x:
1642 set[i]=c+d
1643 # if we add gaps within a group of characters, we want the gap position included in this group
1644 if c==x:
1645 if leftgreedy or (i>0 and set[i-1]==c-1):
1646 addpos=i
1647 if addpos>0:
1648 set[addpos:addpos]=range(x,x+d)
1649 return set
1650
1651 if pos<0 or pos>self.nchar:
1652 raise NexusError('Illegal gap position: %d' % pos)
1653 if n==0:
1654 return
1655 if self.taxlabels:
1656 #python 2.3 does not support zip(*[])
1657 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1658 else:
1659 sitesm=[]
1660 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1661 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\
1662 # i,taxon in enumerate(self.taxlabels)])
1663 zipped=zip(*sitesm)
1664 mapped=map(''.join,zipped)
1665 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1666 self.matrix=dict(listed)
1667 self.nchar+=n
1668 # now adjust character sets
1669 for i,s in self.charsets.iteritems():
1670 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1671 for p in self.charpartitions:
1672 for sp,s in self.charpartitions[p].iteritems():
1673 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1674 # now adjust character state labels
1675 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1676 return self.charlabels
1677
1679 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1680 if exclude and insert:
1681 raise NexusError('Can\'t exclude and insert at the same time')
1682 if not self.charlabels:
1683 return None
1684 labels=sorted(self.charlabels)
1685 newcharlabels={}
1686 if exclude:
1687 exclude.sort()
1688 exclude.append(sys.maxint)
1689 excount=0
1690 for c in labels:
1691 if not c in exclude:
1692 while c>exclude[excount]:
1693 excount+=1
1694 newcharlabels[c-excount]=self.charlabels[c]
1695 elif insert:
1696 insert.sort()
1697 insert.append(sys.maxint)
1698 icount=0
1699 for c in labels:
1700 while c>=insert[icount]:
1701 icount+=1
1702 newcharlabels[c+icount]=self.charlabels[c]
1703 else:
1704 return self.charlabels
1705 return newcharlabels
1706
1708 """Returns all character indices that are not in charlist."""
1709 return [c for c in range(self.nchar) if c not in charlist]
1710
1712 """Return gap-only sites."""
1713 gap=set(self.gap)
1714 if include_missing:
1715 gap.add(self.missing)
1716 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1717 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
1718 return gaponly
1719
1721 """Replaces all terminal gaps with missing character.
1722
1723 Mixtures like ???------??------- are properly resolved."""
1724
1725 if not missing:
1726 missing=self.missing
1727 replace=[self.missing,self.gap]
1728 if not skip_n:
1729 replace.extend(['n','N'])
1730 for taxon in self.taxlabels:
1731 sequence=self.matrix[taxon].tostring()
1732 length=len(sequence)
1733 start,end=get_start_end(sequence,skiplist=replace)
1734 if start==-1 and end==-1:
1735 sequence=missing*length
1736 else:
1737 sequence=sequence[:end+1]+missing*(length-end-1)
1738 sequence=start*missing+sequence[start:]
1739 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon
1740 self.matrix[taxon]=Seq(sequence,self.alphabet)
1741
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Fri Nov 26 16:20:08 2010 | http://epydoc.sourceforge.net |