| Trees | Indices | Help |
|---|
|
|
1 #!/usr/bin/env python
2 #
3 # Restriction Analysis Libraries.
4 # Copyright (C) 2004. Frederic Sohm.
5 #
6 # This code is part of the Biopython distribution and governed by its
7 # license. Please see the LICENSE file that should have been included
8 # as part of this package.
9 #
10 # this script is used to produce the dictionary which will contains the data
11 # about the restriction enzymes from the Emboss/Rebase data files
12 # namely
13 # emboss_e.### (description of the sites),
14 # emboss_r.### (origin, methylation, references)
15 # emboss_s.### (suppliers)
16 # where ### is a number of three digits : 1 for the year two for the month
17 #
18 # very dirty implementation but it does the job, so...
19 # Not very quick either but you are not supposed to use it frequently.
20 #
21 # The results are stored in
22 # path/to/site-packages/Bio/Restriction/Restriction_Dictionary.py
23 # the file contains two dictionary:
24 # 'rest_dict' which contains the data for the enzymes
25 # and
26 # 'suppliers' which map the name of the suppliers to their abbreviation.
27 #
28
29 """Convert a serie of Rebase files into a Restriction_Dictionary.py module.
30
31 The Rebase files are in the emboss format:
32
33 emboss_e.### -> contains informations about the restriction sites.
34 emboss_r.### -> contains general informations about the enzymes.
35 emboss_s.### -> contains informations about the suppliers.
36
37 ### is a 3 digit number. The first digit is the year and the two last the month.
38 """
39
40 import sre
41 import os
42 import itertools
43 import time
44 import sys
45 import site
46 import shutil
47
48 from Bio.Seq import Seq
49 from Bio.Alphabet import IUPAC
50
51 import Bio.Restriction.Restriction
52 from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\
53 TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\
54 Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available
55
56 import Bio.Restriction.RanaConfig as config
57 from Bio.Restriction._Update.Update import RebaseUpdate
58 from Bio.Restriction.Restriction import *
59 from Bio.Restriction.DNAUtils import antiparallel
60
61 DNA=Seq
62 dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T',
63 'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT',
64 'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT',
65 'N':'ACGT',
66 'a': 'a', 'c': 'c', 'g': 'g', 't': 't',
67 'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt',
68 'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt',
69 'n':'acgt'}
70
71
72 complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R',
73 'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H',
74 'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c',
75 't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k',
76 'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'}
77 enzymedict = {}
78 suppliersdict = {}
79 classdict = {}
80 typedict = {}
81
82
86
88 """BaseExpand(base) -> string.
89
90 given a degenerated base, returns its meaning in IUPAC alphabet.
91
92 i.e:
93 b= 'A' -> 'A'
94 b= 'N' -> 'ACGT'
95 etc..."""
96 base = base.upper()
97 return dna_alphabet[base]
98
100 """regex(site) -> string.
101
102 Construct a regular expression from a DNA sequence.
103 i.e.:
104 site = 'ABCGN' -> 'A[CGT]CG.'"""
105 reg_ex = site
106 for base in reg_ex:
107 if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't'):
108 pass
109 if base in ('N', 'n'):
110 reg_ex = '.'.join(reg_ex.split('N'))
111 reg_ex = '.'.join(reg_ex.split('n'))
112 if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V'):
113 expand = '['+ str(BaseExpand(base))+']'
114 reg_ex = expand.join(reg_ex.split(base))
115 return reg_ex
116
118 """Antiparallel(sequence) -> string.
119
120 returns a string which represents the reverse complementary strand of
121 a DNA sequence."""
122 return antiparallel(sequence.tostring())
123
125 """is_palindrom(sequence) -> bool.
126
127 True is the sequence is a palindrom.
128 sequence is a DNA object."""
129 return sequence == DNA(Antiparallel(sequence))
130
132 """LocalTime() -> string.
133
134 LocalTime calculate the extension for emboss file for the current year and
135 month."""
136 t = time.gmtime()
137 year = str(t.tm_year)[-1]
138 month = str(t.tm_mon)
139 if len(month) == 1 : month = '0'+month
140 return year+month
141
142
144 """construct the attributes of the enzyme corresponding to 'name'."""
146 cls.opt_temp = 37
147 cls.inact_temp = 65
148 cls.substrat = 'DNA'
149 target = enzymedict[name]
150 cls.site = target[0]
151 cls.size = target[1]
152 cls.suppl = tuple(target[9])
153 cls.freq = target[11]
154 cls.ovhg = target[13]
155 cls.ovhgseq = target[14]
156 cls.bases = ()
157 #
158 # Is the site palindromic?
159 # Important for the way the DNA is search for the site.
160 # Palindromic sites needs to be looked for only over 1 strand.
161 # Non Palindromic needs to be search for on the reverse complement
162 # as well.
163 #
164 if target[10] : cls.bases += ('Palindromic',)
165 else : cls.bases += ('NonPalindromic',)
166 #
167 # Number of cut the enzyme produce.
168 # 0 => unknown, the enzyme has not been fully characterised.
169 # 2 => 1 cut, (because one cut is realised by cutting 2 strands
170 # 4 => 2 cuts, same logic.
171 # A little bit confusing but it is the way EMBOSS/Rebase works.
172 #
173 if not target[2]:
174 #
175 # => undefined enzymes, nothing to be done.
176 #
177 cls.bases += ('NoCut','Unknown', 'NotDefined')
178 cls.fst5 = None
179 cls.fst3 = None
180 cls.scd5 = None
181 cls.scd3 = None
182 cls.ovhg = None
183 cls.ovhgseq = None
184 else:
185 #
186 # we will need to calculate the overhang.
187 #
188 if target[2] == 2:
189 cls.bases += ('OneCut',)
190 cls.fst5 = target[4]
191 cls.fst3 = target[5]
192 cls.scd5 = None
193 cls.scd3 = None
194 else:
195 cls.bases += ('TwoCuts',)
196 cls.fst5 = target[4]
197 cls.fst3 = target[5]
198 cls.scd5 = target[6]
199 cls.scd3 = target[7]
200 #
201 # Now, prepare the overhangs which will be added to the DNA
202 # after the cut.
203 # Undefined enzymes will not be allowed to catalyse,
204 # they are not available commercially anyway.
205 # I assumed that if an enzyme cut twice the overhang will be of
206 # the same kind. The only exception is HaeIV. I do not deal
207 # with that at the moment (ie I don't include it,
208 # need to be fixed).
209 # They generally cut outside their recognition site and
210 # therefore the overhang is undetermined and dependent of
211 # the DNA sequence upon which the enzyme act.
212 #
213 if target[3]:
214 #
215 # rebase field for blunt: blunt == 1, other == 0.
216 # The enzyme is blunt. No overhang.
217 #
218 cls.bases += ('Blunt', 'Defined')
219 cls.ovhg = 0
220 elif isinstance(cls.ovhg, int):
221 #
222 # => overhang is sequence dependent
223 #
224 if cls.ovhg > 0:
225 #
226 # 3' overhang, ambiguous site (outside recognition site
227 # or site containing ambiguous bases (N, W, R,...)
228 #
229 cls.bases += ('Ov3', 'Ambiguous')
230 elif cls.ovhg < 0:
231 #
232 # 5' overhang, ambiguous site (outside recognition site
233 # or site containing ambiguous bases (N, W, R,...)
234 #
235 cls.bases += ('Ov5', 'Ambiguous')
236 else:
237 #
238 # cls.ovhg is a string => overhang is constant
239 #
240 if cls.fst5 - (cls.fst3 + cls.size) < 0:
241 cls.bases += ('Ov5', 'Defined')
242 cls.ovhg = - len(cls.ovhg)
243 else:
244 cls.bases += ('Ov3', 'Defined')
245 cls.ovhg = + len(cls.ovhg)
246 #
247 # Next class : sensibility to methylation.
248 # Set by EmbossMixer from emboss_r.txt file
249 # Not really methylation dependent at the moment, stands rather for
250 # 'is the site methylable?'.
251 # Proper methylation sensibility has yet to be implemented.
252 # But the class is there for further development.
253 #
254 if target[8]:
255 cls.bases += ('Meth_Dep', )
256 cls.compsite = target[12]
257 else:
258 cls.bases += ('Meth_Undep',)
259 cls.compsite = target[12]
260 #
261 # Next class will allow to select enzymes in function of their
262 # suppliers. Not essential but can be useful.
263 #
264 if cls.suppl:
265 cls.bases += ('Commercially_available', )
266 else:
267 cls.bases += ('Not_available', )
268 cls.bases += ('AbstractCut', 'RestrictionType')
269 cls.__name__ = name
270 cls.results = None
271 cls.dna = None
272 cls.__bases__ = cls.bases
273 cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site)
274 if not target[2] and cls.suppl:
275 supp = ', '.join([suppliersdict[s][0] for s in cls.suppl])
276 print 'WARNING : It seems that %s is both commercially available\
277 \n\tand its characteristics are unknown. \
278 \n\tThis seems counter-intuitive.\
279 \n\tThere is certainly an error either in ranacompiler or\
280 \n\tin this REBASE release.\
281 \n\tThe supplier is : %s.' % (name, supp)
282 return
283
284
286
287 """Build the different types possible for Restriction Enzymes"""
288
292
294 """TC.buildtype() -> generator.
295
296 build the new types that will be needed for constructing the
297 restriction enzymes."""
298 baT = (AbstractCut, RestrictionType)
299 cuT = (NoCut, OneCut, TwoCuts)
300 meT = (Meth_Dep, Meth_Undep)
301 paT = (Palindromic, NonPalindromic)
302 ovT = (Unknown, Blunt, Ov5, Ov3)
303 deT = (NotDefined, Defined, Ambiguous)
304 coT = (Commercially_available, Not_available)
305 All = (baT, cuT, meT, paT, ovT, deT, coT)
306 #
307 # Now build the types. Only the most obvious are left out.
308 # Modified even the most obvious are not so obvious.
309 # emboss_*.403 AspCNI is unknown and commercially available.
310 # So now do not remove the most obvious.
311 #
312 types = [(p,c,o,d,m,co,baT[0],baT[1])
313 for p in paT for c in cuT for o in ovT
314 for d in deT for m in meT for co in coT]
315 n= 1
316 for ty in types:
317 dct = {}
318 for t in ty:
319 dct.update(t.__dict__)
320 #
321 # here we need to customize the dictionary.
322 # i.e. types deriving from OneCut have always scd5 and scd3
323 # equal to None. No need therefore to store that in a specific
324 # enzyme of this type. but it then need to be in the type.
325 #
326 dct['results'] = []
327 dct['substrat'] = 'DNA'
328 dct['dna'] = None
329 if t == NoCut:
330 dct.update({'fst5':None,'fst3':None,
331 'scd5':None,'scd3':None,
332 'ovhg':None,'ovhgseq':None})
333 elif t == OneCut:
334 dct.update({'scd5':None, 'scd3':None})
335 class klass(type):
336 def __new__(cls):
337 return type.__new__(cls, 'type%i'%n,ty,dct)
338 def __init__(cls):
339 super(klass, cls).__init__('type%i'%n,ty,dct)
340 yield klass()
341 n+=1
342
343 start = '\n\
344 #!/usr/bin/env python\n\
345 #\n\
346 # Restriction Analysis Libraries.\n\
347 # Copyright (C) 2004. Frederic Sohm.\n\
348 #\n\
349 # This code is part of the Biopython distribution and governed by its\n\
350 # license. Please see the LICENSE file that should have been included\n\
351 # as part of this package.\n\
352 #\n\
353 # This file is automatically generated - do not edit it by hand! Instead,\n\
354 # use the tool Scripts/Restriction/ranacompiler.py which in turn uses\n\
355 # Bio/Restriction/_Update/RestrictionCompiler.py\n\
356 #\n\
357 # The following dictionaries used to be defined in one go, but that does\n\
358 # not work on Jython due to JVM limitations. Therefore we break this up\n\
359 # into steps, using temporary functions to avoid the JVM limits.\n\
360 \n\n'
361
363
365 """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance.
366
367 If the emboss files used for the construction need to be updated this
368 class will download them if the ftp connection is correctly set.
369 either in RanaConfig.py or given at run time.
370
371 e_mail is the e-mail address used as password for the anonymous
372 ftp connection.
373
374 proxy is the ftp_proxy to use if any."""
375 self.rebase_pass = e_mail or config.Rebase_password
376 self.proxy = ftp_proxy or config.ftp_proxy
377
379 """DB.build_dict() -> None.
380
381 Construct the dictionary and build the files containing the new
382 dictionaries."""
383 #
384 # first parse the emboss files.
385 #
386 emboss_e, emboss_r, emboss_s = self.lastrebasefile()
387 #
388 # the results will be stored into enzymedict.
389 #
390 self.information_mixer(emboss_r, emboss_e, emboss_s)
391 emboss_r.close()
392 emboss_e.close()
393 emboss_s.close()
394 #
395 # we build all the possible type
396 #
397 tdct = {}
398 for klass in TypeCompiler().buildtype():
399 exec klass.__name__ +'= klass'
400 exec "tdct['"+klass.__name__+"'] = klass"
401
402 #
403 # Now we build the enzymes from enzymedict
404 # and store them in a dictionary.
405 # The type we will need will also be stored.
406 #
407
408 for name in enzymedict:
409 #
410 # the class attributes first:
411 #
412 cls = newenzyme(name)
413 #
414 # Now select the right type for the enzyme.
415 #
416 bases = cls.bases
417 clsbases = tuple([eval(x) for x in bases])
418 typestuff = ''
419 for n, t in tdct.iteritems():
420 #
421 # if the bases are the same. it is the right type.
422 # create the enzyme and remember the type
423 #
424 if t.__bases__ == clsbases:
425 typestuff = t
426 typename = t.__name__
427 continue
428 #
429 # now we build the dictionaries.
430 #
431 dct = dict(cls.__dict__)
432 del dct['bases']
433 del dct['__bases__']
434 del dct['__name__']# no need to keep that, it's already in the type.
435 classdict[name] = dct
436
437 commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat',
438 'ovhg', 'ovhgseq','results', 'dna']
439 if typename in typedict:
440 typedict[typename][1].append(name)
441 else:
442 enzlst= []
443 tydct = dict(typestuff.__dict__)
444 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr])
445 enzlst.append(name)
446 typedict[typename] = (bases, enzlst)
447 for letter in cls.__dict__['suppl']:
448 supplier = suppliersdict[letter]
449 suppliersdict[letter][1].append(name)
450 if not classdict or not suppliersdict or not typedict:
451 print 'One of the new dictionaries is empty.'
452 print 'Check the integrity of the emboss file before continuing.'
453 print 'Update aborted.'
454 sys.exit()
455 #
456 # How many enzymes this time?
457 #
458 print '\nThe new database contains %i enzymes.\n' % len(classdict)
459 #
460 # the dictionaries are done. Build the file
461 #
462 #update = config.updatefolder
463
464 update = os.getcwd()
465 results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w')
466 print 'Writing the dictionary containing the new Restriction classes.\t',
467 results.write(start)
468 results.write('rest_dict = {}\n')
469 for name in sorted(classdict) :
470 results.write("def _temp():\n")
471 results.write(" return {\n")
472 for key, value in classdict[name].iteritems() :
473 results.write(" %s : %s,\n" % (repr(key), repr(value)))
474 results.write(" }\n")
475 results.write("rest_dict[%s] = _temp()\n" % repr(name))
476 results.write("\n")
477 print 'OK.\n'
478 print 'Writing the dictionary containing the suppliers datas.\t\t',
479 results.write('suppliers = {}\n')
480 for name in sorted(suppliersdict) :
481 results.write("def _temp():\n")
482 results.write(" return (\n")
483 for value in suppliersdict[name] :
484 results.write(" %s,\n" % repr(value))
485 results.write(" )\n")
486 results.write("suppliers[%s] = _temp()\n" % repr(name))
487 results.write("\n")
488 print 'OK.\n'
489 print 'Writing the dictionary containing the Restriction types.\t',
490 results.write('typedict = {}\n')
491 for name in sorted(typedict) :
492 results.write("def _temp():\n")
493 results.write(" return (\n")
494 for value in typedict[name] :
495 results.write(" %s,\n" % repr(value))
496 results.write(" )\n")
497 results.write("typedict[%s] = _temp()\n" % repr(name))
498 results.write("\n")
499 #I had wanted to do "del _temp" at each stage (just for clarity), but
500 #that pushed the code size just over the Jython JVM limit. We include
501 #one the final "del _temp" to clean up the namespace.
502 results.write("del _temp\n")
503 results.write("\n")
504 print 'OK.\n'
505 results.close()
506 return
507
509 """DB.install_dict() -> None.
510
511 Install the newly created dictionary in the site-packages folder.
512
513 May need super user privilege on some architectures."""
514 print '\n ' +'*'*78 + ' \n'
515 print '\n\t\tInstalling Restriction_Dictionary.py'
516 try:
517 import Bio.Restriction.Restriction_Dictionary as rd
518 except ImportError:
519 print '\
520 \n Unable to locate the previous Restriction_Dictionary.py module\
521 \n Aborting installation.'
522 sys.exit()
523 #
524 # first save the old file in Updates
525 #
526 old = os.path.join(os.path.split(rd.__file__)[0],
527 'Restriction_Dictionary.py')
528 #update_folder = config.updatefolder
529 update_folder = os.getcwd()
530 shutil.copyfile(old, os.path.join(update_folder,
531 'Restriction_Dictionary.old'))
532 #
533 # Now test and install.
534 #
535 new = os.path.join(update_folder, 'Restriction_Dictionary.py')
536 try:
537 execfile(new)
538 print '\
539 \n\tThe new file seems ok. Proceeding with the installation.'
540 except SyntaxError:
541 print '\
542 \n The new dictionary file is corrupted. Aborting the installation.'
543 return
544 try:
545 shutil.copyfile(new, old)
546 print'\n\t Everything ok. If you need it a version of the old\
547 \n\t dictionary have been saved in the Updates folder under\
548 \n\t the name Restriction_Dictionary.old.'
549 print '\n ' +'*'*78 + ' \n'
550 except IOError:
551 print '\n ' +'*'*78 + ' \n'
552 print '\
553 \n\t WARNING : Impossible to install the new dictionary.\
554 \n\t Are you sure you have write permission to the folder :\n\
555 \n\t %s ?\n\n' % os.path.split(old)[0]
556 return self.no_install()
557 return
558
560 """BD.no_install() -> None.
561
562 build the new dictionary but do not install the dictionary."""
563 print '\n ' +'*'*78 + '\n'
564 #update = config.updatefolder
565 try:
566 import Bio.Restriction.Restriction_Dictionary as rd
567 except ImportError:
568 print '\
569 \n Unable to locate the previous Restriction_Dictionary.py module\
570 \n Aborting installation.'
571 sys.exit()
572 #
573 # first save the old file in Updates
574 #
575 old = os.path.join(os.path.split(rd.__file__)[0],
576 'Restriction_Dictionary.py')
577 update = os.getcwd()
578 shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old'))
579 places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0]
580 print "\t\tCompilation of the new dictionary : OK.\
581 \n\t\tInstallation : No.\n\
582 \n You will find the newly created 'Restriction_Dictionary.py' file\
583 \n in the folder : \n\
584 \n\t%s\n\
585 \n Make a copy of 'Restriction_Dictionary.py' and place it with \
586 \n the other Restriction libraries.\n\
587 \n note : \
588 \n This folder should be :\n\
589 \n\t%s\n" % places
590 print '\n ' +'*'*78 + '\n'
591 return
592
593
595 """BD.lastrebasefile() -> None.
596
597 Check the emboss files are up to date and download them if they are not.
598 """
599 embossnames = ('emboss_e', 'emboss_r', 'emboss_s')
600 #
601 # first check if we have the last update:
602 #
603 emboss_now = ['.'.join((x,LocalTime())) for x in embossnames]
604 update_needed = False
605 #dircontent = os.listdir(config.Rebase) # local database content
606 dircontent = os.listdir(os.getcwd())
607 base = os.getcwd() # added for biopython current directory
608 for name in emboss_now:
609 if name in dircontent:
610 pass
611 else:
612 update_needed = True
613
614 if not update_needed:
615 #
616 # nothing to be done
617 #
618 print '\n Using the files : %s'% ', '.join(emboss_now)
619 return tuple([open(os.path.join(base, n)) for n in emboss_now])
620 else:
621 #
622 # may be download the files.
623 #
624 print '\n The rebase files are more than one month old.\
625 \n Would you like to update them before proceeding?(y/n)'
626 r = raw_input(' update [n] >>> ')
627 if r in ['y', 'yes', 'Y', 'Yes']:
628 updt = RebaseUpdate(self.rebase_pass, self.proxy)
629 updt.openRebase()
630 updt.getfiles()
631 updt.close()
632 print '\n Update complete. Creating the dictionaries.\n'
633 print '\n Using the files : %s'% ', '.join(emboss_now)
634 return tuple([open(os.path.join(base, n)) for n in emboss_now])
635 else:
636 #
637 # we will use the last files found without updating.
638 # But first we check we have some file to use.
639 #
640 class NotFoundError(Exception):
641 pass
642 for name in embossnames:
643 try:
644 for file in dircontent:
645 if file.startswith(name):
646 break
647 else:
648 pass
649 raise NotFoundError
650 except NotFoundError:
651 print "\nNo %s file found. Upgrade is impossible.\n"%name
652 sys.exit()
653 continue
654 pass
655 #
656 # now find the last file.
657 #
658 last = [0]
659 for file in dircontent:
660 fs = file.split('.')
661 try:
662 if fs[0] in embossnames and int(fs[1]) > int(last[-1]):
663 if last[0] : last.append(fs[1])
664 else : last[0] = fs[1]
665 else:
666 continue
667 except ValueError:
668 continue
669 last.sort()
670 last = last[::-1]
671 if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0]
672 for number in last:
673 files = [(name, name+'.%s'%number) for name in embossnames]
674 strmess = '\nLast EMBOSS files found are :\n'
675 try:
676 for name,file in files:
677 if os.path.isfile(os.path.join(base, file)):
678 strmess += '\t%s.\n'%file
679 else:
680 raise ValueError
681 print strmess
682 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r')
683 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r')
684 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r')
685 return emboss_e, emboss_r, emboss_s
686 except ValueError:
687 continue
688
690 line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:]
691 name = line[0].replace("-","_")
692 site = line[1] # sequence of the recognition site
693 dna = DNA(site)
694 size = line[2] # size of the recognition site
695 #
696 # Calculate the overhang.
697 #
698 fst5 = line[5] # first site sense strand
699 fst3 = line[6] # first site antisense strand
700 scd5 = line[7] # second site sense strand
701 scd3 = line[8] # second site antisense strand
702
703 #
704 # the overhang is the difference between the two cut
705 #
706 ovhg1 = fst5 - fst3
707 ovhg2 = scd5 - scd3
708
709 #
710 # 0 has the meaning 'do not cut' in rebase. So we get short of 1
711 # for the negative numbers so we add 1 to negative sites for now.
712 # We will deal with the record later.
713 #
714
715 if fst5 < 0 : fst5 += 1
716 if fst3 < 0 : fst3 += 1
717 if scd5 < 0 : scd5 += 1
718 if scd3 < 0 : scd3 += 1
719
720 if ovhg2 != 0 and ovhg1 != ovhg2:
721 #
722 # different length of the overhang of the first and second cut
723 # it's a pain to deal with and at the moment it concerns only
724 # one enzyme which is not commercially available (HaeIV).
725 # So we don't deal with it but we check the progression
726 # of the affair.
727 # Should HaeIV become commercially available or other similar
728 # new enzymes be added, this might be modified.
729 #
730 print '\
731 \nWARNING : %s cut twice with different overhang length each time.\
732 \n\tUnable to deal with this behaviour. \
733 \n\tThis enzyme will not be included in the database. Sorry.' %name
734 print '\tChecking :',
735 raise OverhangError
736 if 0 <= fst5 <= size and 0 <= fst3 <= size:
737 #
738 # cut inside recognition site
739 #
740 if fst5 < fst3:
741 #
742 # 5' overhang
743 #
744 ovhg1 = ovhgseq = site[fst5:fst3]
745 elif fst5 > fst3:
746 #
747 # 3' overhang
748 #
749 ovhg1 = ovhgseq = site[fst3:fst5]
750 else:
751 #
752 # blunt
753 #
754 ovhg1 = ovhgseq = ''
755 for base in 'NRYWMSKHDBV':
756 if base in ovhg1:
757 #
758 # site and overhang degenerated
759 #
760 ovhgseq = ovhg1
761 if fst5 < fst3 : ovhg1 = - len(ovhg1)
762 else : ovhg1 = len(ovhg1)
763 break
764 else:
765 continue
766 elif 0 <= fst5 <= size:
767 #
768 # 5' cut inside the site 3' outside
769 #
770 if fst5 < fst3:
771 #
772 # 3' cut after the site
773 #
774 ovhgseq = site[fst5:] + (fst3 - size) * 'N'
775 elif fst5 > fst3:
776 #
777 # 3' cut before the site
778 #
779 ovhgseq = abs(fst3) * 'N' + site[:fst5]
780 else:
781 #
782 # blunt outside
783 #
784 ovhg1 = ovhgseq = ''
785 elif 0 <= fst3 <= size:
786 #
787 # 3' cut inside the site, 5' outside
788 #
789 if fst5 < fst3:
790 #
791 # 5' cut before the site
792 #
793 ovhgseq = abs(fst5) * 'N' + site[:fst3]
794 elif fst5 > fst3:
795 #
796 # 5' cut after the site
797 #
798 ovhgseq = site[fst3:] + (fst5 - size) * 'N'
799 else:
800 #
801 # should not happend
802 #
803 raise ValueError('Error in #1')
804 elif fst3 < 0 and size < fst5:
805 #
806 # 3' overhang. site is included.
807 #
808 ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N'
809 elif fst5 < 0 and size <fst3:
810 #
811 # 5' overhang. site is included.
812 #
813 ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N'
814 else:
815 #
816 # 5' and 3' outside of the site
817 #
818 ovhgseq = 'N' * abs(ovhg1)
819 #
820 # Now line[5] to [8] are the location of the cut but we have to
821 # deal with the weird mathematics of biologists.
822 #
823 # EMBOSS sequence numbering give:
824 # DNA = 'a c g t A C G T'
825 # -1 1 2 3 4
826 #
827 # Biologists do not know about 0. Too much use of latin certainly.
828 #
829 # To compensate, we add 1 to the positions if they are negative.
830 # No need to modify 0 as it means no cut and will not been used.
831 # Positive numbers should be ok since our sequence starts 1.
832 #
833 # Moreover line[6] and line[8] represent cut on the reverse strand.
834 # They will be used for non palindromic sites and sre.finditer
835 # will detect the site in inverse orientation so we need to add the
836 # length of the site to compensate (+1 if they are negative).
837 #
838 for x in (5, 7):
839 if line[x] < 0 : line[x] += 1
840 for x in (6, 8):
841 if line[x] > 0 : line[x] -= size
842 elif line[x] < 0 : line[x] = line[x] - size + 1
843 #
844 # now is the site palindromic?
845 # produce the regular expression which correspond to the site.
846 # tag of the regex will be the name of the enzyme for palindromic
847 # enzymesband two tags for the other, the name for the sense sequence
848 # and the name with '_as' at the end for the antisense sequence.
849 #
850 rg = ''
851 if is_palindrom(dna):
852 line.append(True)
853 rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
854 else:
855 line.append(False)
856 sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
857 antisense = ''.join(['(?P<', name, '_as>',
858 regex(Antiparallel(dna)), ')'])
859 rg = sense + '|' + antisense
860 #
861 # exact frequency of the site. (ie freq(N) == 1, ...)
862 #
863 f = [4/len(dna_alphabet[l]) for l in site.upper()]
864 freq = reduce(lambda x, y : x*y, f)
865 line.append(freq)
866 #
867 # append regex and ovhg1, they have not been appended before not to
868 # break the factory class. simply to leazy to make the changes there.
869 #
870 line.append(rg)
871 line.append(ovhg1)
872 line.append(ovhgseq)
873 return line
874
876 #
877 # remove the heading of the file.
878 #
879 return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
880
882 #
883 # emboss_r.txt, separation between blocks is //
884 #
885 take = itertools.takewhile
886 block = [l for l in take(lambda l :not l.startswith('//'),file[index:])]
887 index += len(block)+1
888 return block, index
889
891 #
892 # take what we want from the block.
893 # Each block correspond to one enzyme.
894 # block[0] => enzyme name
895 # block[3] => methylation (position and type)
896 # block[5] => suppliers (as a string of single letter)
897 #
898 bl3 = block[3].strip()
899 if not bl3 : bl3 = False # site is not methylable
900 return (block[0].strip(), bl3, block[5].strip())
901
903 #
904 # Mix all the information from the 3 files and produce a coherent
905 # restriction record.
906 #
907 methfile = self.removestart(file1)
908 sitefile = self.removestart(file2)
909 supplier = self.removestart(file3)
910
911 i1, i2= 0, 0
912 try:
913 while True:
914 block, i1 = self.getblock(methfile, i1)
915 bl = self.get(block)
916 line = (sitefile[i2].strip()).split()
917 name = line[0]
918 if name == bl[0]:
919 line.append(bl[1]) # -> methylation
920 line.append(bl[2]) # -> suppliers
921 else:
922 bl = self.get(oldblock)
923 if line[0] == bl[0]:
924 line.append(bl[1])
925 line.append(bl[2])
926 i2 += 1
927 else:
928 raise TypeError
929 oldblock = block
930 i2 += 1
931 try:
932 line = self.parseline(line)
933 except OverhangError : # overhang error
934 n = name # do not include the enzyme
935 if not bl[2]:
936 print 'Anyway, %s is not commercially available.\n' %n
937 else:
938 print 'Unfortunately, %s is commercially available.\n'%n
939
940 continue
941 #Hyphens can't be used as a Python name, nor as a
942 #group name in a regular expression.
943 name = name.replace("-","_")
944 if name in enzymedict:
945 #
946 # deal with TaqII and its two sites.
947 #
948 print '\nWARNING :',
949 print name, 'has two different sites.\n'
950 other = line[0].replace("-","_")
951 dna = DNA(line[1])
952 sense1 = regex(dna.tostring())
953 antisense1 = regex(Antiparallel(dna))
954 dna = DNA(enzymedict[other][0])
955 sense2 = regex(dna.tostring())
956 antisense2 = regex(Antiparallel(dna))
957 sense = '(?P<'+other+'>'+sense1+'|'+sense2+')'
958 antisense = '(?P<'+other+'_as>'+antisense1+'|'+antisense2 + ')'
959 reg = sense + '|' + antisense
960 line[1] = line[1] + '|' + enzymedict[other][0]
961 line[-1] = reg
962 #
963 # the data to produce the enzyme class are then stored in
964 # enzymedict.
965 #
966 enzymedict[name] = line[1:] #element zero was the name
967 except IndexError:
968 pass
969 for i in supplier:
970 #
971 # construction of the list of suppliers.
972 #
973 t = i.strip().split(' ', 1)
974 suppliersdict[t[0]] = (t[1], [])
975 return
976
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Wed Dec 16 11:27:35 2009 | http://epydoc.sourceforge.net |