"""
.. module:: orgasm.tango
:platform: Unix
:synopsis: The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage assembling structure.
The :py:mod:`orgasm.tango` package
==================================
The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage assembling structure.
.. moduleauthor:: Eric Coissac <eric.coissac@inria.fr>
:author: Eric Coissac
:contact: eric.coissac@inria.fr
.. warning::
The :py:mod:`~orgasm.tango` package functions aims to be integrated
into other packages as standalone functions of class methods.
"""
import sys
from orgasm.multialign import multiAlignReads, consensus # @UnresolvedImport
from orgasm.assembler import Assembler # @UnresolvedImport
from orgasm.assembler import buildstem # @UnresolvedImport
from orgasm.assembler import tango, getusedreads # @UnresolvedImport
from orgasm.utils import tags2str,bytes2str
from time import time
import math
from functools import reduce
from sys import stderr
from orgasm.multialign._multi import alignSequence # @UnresolvedImport
def logOrPrint(logger,message,level='info'):
if logger is not None:
getattr(logger, level)(message)
else:
print(message,file=sys.stderr)
[docs]def cutLowCoverage(self,mincov,terminal=True):
'''
Remove sequences in the assembling graph with a coverage below ``mincov``.
.. code-block :: ipython
In [159]: asm = Assembler(r)
In [160]: s = matchtoseed(m,r)
In [161]: a = tango(asm,s,mincov=1,minread=10,minoverlap=30,maxjump=0,cycle=1)
In [162]: asm.cleanDeadBranches(maxlength=10)
Remaining edges : 424216 node : 423896
Out[162]: 34821
In [162]: cutLowCoverage(asm,10,terminal=False)
:param mincov: coverage threshold
:type mincov: :py:class:`int`
:param terminal: if set to ``True`` only terminal edges are removed from the assembling graph
:type terminal: :py:class:`bool`
:return: the count of deleted node
:rtype: :py:class:`int`
:seealso: :py:meth:`~orgasm.assambler.Assembler.cleanDeadBranches`
'''
def isTerminal(g,n):
return len(list(g.parentIterator(n)))==0 or len(list(g.neighbourIterator(n)))==0
def endnodeset(g):
return set(g.nodeIterator(predicate = lambda n : (len(list(g.neighbourIterator(n)))==0)))
def startnodeset(g):
return set(g.nodeIterator(predicate = lambda n : (len(list(g.parentIterator(n)))==0)))
ontty = sys.stderr.isatty()
if terminal:
tstates=[True]
else:
tstates=[True,False]
ilength=len(self)
cg = self.compactAssembling(verbose=False)
index = self.index
readSize = index.getReadSize()
for terminal in tstates:
print('',file=sys.stderr)
if terminal:
print("Deleting terminal branches",file=sys.stderr)
else:
print("Deleting internal branches",file=sys.stderr)
extremities = endnodeset(cg) | startnodeset(cg)
go = True
while go:
go = False
stems = [x for x in cg.edgeIterator()
if not terminal or (isTerminal(cg, x[0]) or isTerminal(cg, x[1]))]
if stems:
stems.sort(key=lambda i:cg.getEdgeAttr(*i)['weight'],reverse=True)
lightest = stems.pop()
lattr = cg.getEdgeAttr(*lightest)
if lattr['weight'] < mincov:
if stems:
go=True
for n in lattr['path'][1:-1]:
if n in self.graph:
try:
del self.graph[n]
except KeyError:
pass
if lightest[0] in extremities and lightest[0] in self.graph:
del self.graph[lightest[0]]
if lightest[1] in extremities and lightest[1] in self.graph:
del self.graph[lightest[1]]
if ontty:
print("Remaining edges : %d node : %d\r" % (self.graph.edgeCount(),len(self)),
end='\r',
file=sys.stderr)
cg.deleteEdge(*lightest)
tojoin=[]
if lightest[0] in extremities:
del cg[lightest[0]]
else:
tojoin.append(lightest[0])
if lightest[1] in extremities:
del cg[lightest[1]]
else:
tojoin.append(lightest[1])
# print >>sys.stderr,lightest[0] in extremities,lightest[1] in extremities,tojoin
for c in tojoin:
if c in cg:
begin = list(cg.parentIterator(c))
end = list(cg.neighbourIterator(c))
if len(begin)==1 and len(end)==1:
begin = begin[0]
end = end[0]
e1s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==begin and e[1]==c))
e2s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==c and e[1]==end))
if len(e1s)==1 and len(e2s)==1:
e1 = e1s[0]
e2 = e2s[0]
attr1 = cg.getEdgeAttr(*e1)
attr2 = cg.getEdgeAttr(*e2)
sequence=attr1['sequence'] + attr2['sequence']
path=attr1['path'] + attr2['path'][1:]
stem = buildstem(self,
begin,
end,
sequence,
path,
False)
stem['graphics']={'width':int(stem['weight']/readSize)+1,
'arrow':'last'}
cg.addStem(stem)
cg.deleteEdge(*e2)
cg.deleteEdge(*e1)
del cg[c]
if ontty:
print('',file=sys.stderr)
return ilength - len(self)
[docs]def cutLowSeeds(self,minseeds,seeds,terminal=True):
'''
Remove sequences in the assembling graph with a coverage below ``mincov``.
.. code-block :: ipython
In [159]: asm = Assembler(r)
In [160]: s = matchtoseed(m,r)
In [161]: a = tango(asm,s,mincov=1,minread=10,minoverlap=30,maxjump=0,cycle=1)
In [162]: asm.cleanDeadBranches(maxlength=10)
Remaining edges : 424216 node : 423896
Out[162]: 34821
In [162]: cutLowCoverage(asm,10,terminal=False)
:param mincov: coverage threshold
:type mincov: :py:class:`int`
:param terminal: if set to ``True`` only terminal edges are removed from the assembling graph
:type terminal: :py:class:`bool`
:return: the count of deleted node
:rtype: :py:class:`int`
:seealso: :py:meth:`~orgasm.assambler.Assembler.cleanDeadBranches`
'''
def isTerminal(g,n):
return len(list(g.parentIterator(n)))==0 or len(list(g.neighbourIterator(n)))==0
def endnodeset(g):
return set(g.nodeIterator(predicate = lambda n : (len(list(g.neighbourIterator(n)))==0)))
def startnodeset(g):
return set(g.nodeIterator(predicate = lambda n : (len(list(g.parentIterator(n)))==0)))
ontty = sys.stderr.isatty()
if terminal:
tstates=[True]
else:
tstates=[True,False]
index = self.index
readSize = index.getReadSize()
ilength=len(self)
cg = self.compactAssembling(verbose=False)
genesincontig(cg,index,seeds)
for terminal in tstates:
print('',file=sys.stderr)
if terminal:
print("Deleting terminal branches",file=sys.stderr)
else:
print("Deleting internal branches",file=sys.stderr)
extremities = endnodeset(cg) | startnodeset(cg)
go = True
while go:
go = False
stems = [x for x in cg.edgeIterator()
if (not terminal
or (isTerminal(cg, x[0]) or isTerminal(cg, x[1])))
and 'ingene' in cg.getEdgeAttr(*x)
]
if stems:
stems.sort(key=lambda i:cg.getEdgeAttr(*i)['ingene'],reverse=True)
lightest = stems.pop()
lattr = cg.getEdgeAttr(*lightest)
if lattr['ingene'] < minseeds:
if stems:
go=True
for n in lattr['path'][1:-1]:
if n in self.graph:
try:
del self.graph[n]
except KeyError:
pass
if lightest[0] in extremities and lightest[0] in self.graph:
del self.graph[lightest[0]]
if lightest[1] in extremities and lightest[1] in self.graph:
del self.graph[lightest[1]]
if ontty:
print("Remaining edges : %d node : %d\r" % (self.graph.edgeCount(),len(self)),
end='\r',
file=sys.stderr)
cg.deleteEdge(*lightest)
tojoin=[]
if lightest[0] in extremities:
del cg[lightest[0]]
else:
tojoin.append(lightest[0])
if lightest[1] in extremities:
del cg[lightest[1]]
else:
tojoin.append(lightest[1])
# print >>sys.stderr,lightest[0] in extremities,lightest[1] in extremities,tojoin
for c in tojoin:
if c in cg:
begin = list(cg.parentIterator(c))
end = list(cg.neighbourIterator(c))
if len(begin)==1 and len(end)==1:
begin = begin[0]
end = end[0]
e1s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==begin and e[1]==c))
e2s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==c and e[1]==end))
if len(e1s)==1 and len(e2s)==1:
e1 = e1s[0]
e2 = e2s[0]
attr1 = cg.getEdgeAttr(*e1)
attr2 = cg.getEdgeAttr(*e2)
sequence=attr1['sequence'] + attr2['sequence']
path=attr1['path'] + attr2['path'][1:]
stem = buildstem(self,
begin,
end,
sequence,
path,
False)
stem['graphics']={'width':int(stem['weight']/readSize)+1,
'arrow':'last'}
cg.addStem(stem)
cg.deleteEdge(*e2)
cg.deleteEdge(*e1)
del cg[c]
if ontty:
print('',file=sys.stderr)
return ilength - len(self)
def cutSNPs(self,maxlength=500):
ontty = sys.stderr.isatty()
cg = self.compactAssembling(verbose=False)
snps = [(snp,
1 if snp[2][0][1][2] > snp[2][1][1][2] else 0) # indicate which allele needs to be elimimated
for snp in [(i, # start node
next(cg.neighbourIterator(i)), # end node
[(k[2], # allele id (0 or 1)
(cg.getEdgeAttr(*k)['stemid'], # allele stemid
cg.getEdgeAttr(*k)['length'], # allele length
cg.getEdgeAttr(*k)['weight']) # allele coverage
)
for k in cg.edgeIterator(edgePredicate=lambda j:j[0]==i)
]
)
for i in cg.nodeIterator(predicate=lambda n :
len(list(cg.neighbourIterator(n))) == 1
and len(list(cg.edgeIterator(edgePredicate=lambda e : e[0]==n)))==2)
] if snp[2][0][1][0]>0 # keep one copy of the snp
and snp[2][0][1][0]!=-snp[2][1][1][0] # exclude pairs of reverse-complement allele
and snp[2][0][1][1]<maxlength # exclude allele longer than 500 bp
and snp[2][1][1][1]<maxlength #
]
for snp in snps:
edge = cg.getEdgeAttr(snp[0][0],snp[0][1],snp[1])
nodes = edge['path'][1:-1]
if ontty:
print("Remaining edges : %d node : %d" % (self.graph.edgeCount(),len(self)),
end='\r',
file=sys.stderr)
for node in nodes:
if node in self.graph:
del self.graph[node]
if ontty:
print("",file=sys.stderr)
[docs]def mode(data):
'''
Compute a raw estimation of the mode of a data set
:param data: The data set to analyse
:type data: a permanent iterable object (list, tuble...)
'''
data = list(data)
if len(data) == 0:
return None
if len(data) < 8:
return int(sum(data)/len(data))
data.sort()
xx = [0,0,0]
for j in range(3):
windows = len(data)//2
intervals = [data[i+windows]-data[i] for i in range(windows)]
mininter = min(intervals)
begininter = intervals.index(mininter)
data = data[begininter:(begininter+windows)]
xx[j]=sum(data)/windows
return sum(xx)/3
def weightedMode(data):
d = []
for x,w in data:
d.extend([x] * w)
return mode(d)
def matchtoseed(matches,index,new=None):
s=[]
if new is None:
new=list(matches.keys())
for p in new:
m = matches[p][1]
k = m.keys()
for x in k:
s.extend((index.getIds(y[0])[0],(x,),0) for y in m[x])
return s
def matchtogene(matches):
genes = {}
if matches is not None:
for p in matches: # Loop over probe set
m = matches[p][1]
k = m.keys()
for x in k: # Loop over a probes in a set
for i in m[x]: # loop over matches in a probe
if i[0] < 0:
pos = -i[5]
else:
pos = i[5]
genes[abs(i[0])]=(x,pos)
return genes
def genesincontig(cg,index,matches):
def vread(i):
if abs(i) > len(index):
v=0
elif abs(i) in genes:
v=1
else:
v=-1
return v
genes = matchtogene(matches)
ei = cg.edgeIterator(edgePredicate=lambda e: 'path' in cg.getEdgeAttr(*e))
for e in ei:
ea = cg.getEdgeAttr(*e)
if ea['class']=="sequence":
path = ea['path']
eg = [vread(i) for i in path]
ep = [genes.get(abs(i),()) for i in path]
g = sum(i for i in eg if i > 0)
if g > 0:
g=max(min(math.ceil(math.log10(g)),4)*64-1,0)
color="#00%02X%02X" % (g,255-g)
else:
color="#0000FF"
graphics = ea.get('graphics',{})
graphics['arrow']='last'
graphics['fill']=color
ea['graphics']=graphics
ea['ingene']=g
ea['genepos']=ep
def testOverlap(seqfrom,seqto,headto,readsize):
seqto=headto+seqto
lseq=min(len(seqfrom),len(seqto),readsize)
seqfrom = seqfrom[-lseq:]
seqto = seqto[0:lseq]
ali = alignSequence(seqfrom,seqto)
ok = (not ali[0] \
and ali[1] > 0 \
and len(ali[2])==1 \
and len(ali[3])==1)
if ok:
return lseq - ali[2][0]
else:
return -1
[docs]def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=set(),logger=None):
'''
Add relationships between edges of the assembling graph related to the
par ended links.
:param assgraph: The compact assembling graph as produced by the
:py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
:type assgraph: :py:class:`~orgasm.graph.DiGraphMultiEdge`
:param minlink: the minimum count of pair ended link to consider
for asserting the relationship
:type minlink: :py:class:`int`
:param back: How many base pairs must be considered at the end of each edge
:type back: :py:class:`int`
:param addConnectedLink: add to the assembling graph green edges for each directly
connected edge pair representing the pair ended links
asserting the connection.
:type addConnectedLink: :py:class:`bool`
'''
frglen,frglensd = estimateFragmentLength(self)
readsize=self.index.getReadSize()
if frglen is None:
logger.warning("Insert size cannot be estimated")
frglen=300
frglensd=1
logger.warning("Insert size set to %d" % frglen)
nforcedLink = set((-p[1],-p[0])
if abs(p[0]) > abs(p[1])
else (p[0],p[1])
for p in forcedLink)
eiat = dict((assgraph.getEdgeAttr(*i)['stemid'],
assgraph.getEdgeAttr(*i)) for i in assgraph.edgeIterator())
maxstemid = max(eiat)
if addConnectedLink:
# Fake function just testing the stemid attribute
def isInitial(n):
return 'stemid' in assgraph.getEdgeAttr(*n)
def isTerminal(n):
return 'stemid' in assgraph.getEdgeAttr(*n)
else:
def isInitial(n):
return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
def isTerminal(n):
return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
nei = len(ei)
net = len(et)
s=[]
for e1 in range(net):
for e2 in range(nei):
npair = ((etid[e1], eiid[e2])
if (abs(etid[e1]) < abs(eiid[e2]))
else (-eiid[e2],-etid[e1]))
connected = et[e1][1]==ei[e2][0]
linkedby,ml,sl,delta = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
first=assgraph.getEdgeAttr(*et[e1])['last']
last=assgraph.getEdgeAttr(*ei[e2])['first']
if connected and addConnectedLink:
if linkedby >= minlink:
s.append(('l',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#00FF00",ml,sl,delta,first,last))
logOrPrint(logger,
"%d -> %d connection asserted by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
else:
logOrPrint(logger,
"%d -> %d connection not asserted by pair ended link" % (etid[e1], eiid[e2]))
elif not connected and npair in nforcedLink:
s.append(('f',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF0000",ml,sl,delta,first,last))
if linkedby > 0:
logOrPrint(logger,
"%d -> %d forced but supported by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
else:
logOrPrint(logger,
"%d -> %d forced and not supported by pair ended links" % (etid[e1], eiid[e2]))
elif not connected and linkedby >= minlink:
overlap = testOverlap(eiat[etid[e1]]['sequence'],
eiat[eiid[e2]]['sequence'],
eiat[eiid[e2]]['head'],
readsize)
if overlap >= 0:
s.append(('o',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF00FF",overlap,0,[],first,last))
logOrPrint(logger,
"%d -> %d overlap of %dbp supported by %d pair ended links" % (etid[e1], eiid[e2],overlap,linkedby))
else:
s.append(('s',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF6600",ml,sl,delta,first,last))
logOrPrint(logger,
"%d -> %d scaffolded by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
nstemid={}
for kind,x,y,z,s1,s2,color,ml,sl,delta,first,last in s:
if (abs(x) > abs(y)):
npair=(-y,-x)
nsign=-1
else:
npair=(x,y)
nsign=1
if npair not in nstemid:
maxstemid+=1
nstemid[npair]=maxstemid
nid=nstemid[npair] * nsign
attr = assgraph.addEdge(x,y)
if kind=="s":
glengths = [frglen - i - 2 * readsize
for i in delta]
pglengths = [i for i in glengths if i >=0]
if len(pglengths) > 1:
glength = sum(pglengths) / len(pglengths)
glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
else:
glength = sum(glengths) / len(glengths)
glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
attr['label']="%d : Gap (%dbp) [%d,%d] %d -> %d" % (nid,glength,z,len(pglengths),s1,s2)
attr['length']=int(glength) if int(glength) > 0 else 10
attr['first']=first
attr['last']=last
attr['weight']=0
attr['gappairs']=len(glengths)
attr['gaplength']=int(glength)
attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
attr['gapdeltas']=[frglen - i - 2 * readsize for i in delta]
attr['pairendlink']=z
attr['ingene']=0
attr['link']=(s1,s2)
attr['graphics']={'width':z // 10.,
'arrow':'last',
'fill':color
}
attr['stemid']=nid
attr['sequence']=b"N" * attr['length']
attr['path']=[first] + [0] * (attr['length']-1) + [last]
attr['class']='scaffold:paired-end'
elif kind=="o":
attr['label']="%d : Overlap (%dbp) [%d] %d -> %d" % (nid,ml,z,s1,s2)
attr['length']=-ml
attr['first']=first
attr['last']=last
attr['weight']=0
attr['pairendlink']=z
attr['ingene']=0
attr['link']=(s1,s2)
attr['graphics']={'width':1,
'arrow':'last',
'fill':color
}
attr['stemid']=nid
attr['sequence']=b''
attr['path']=[first]+ [0] * (readsize-ml-1) +[last]
attr['class']='scaffold:overlap'
elif kind=="f":
glengths = [frglen - i - 2 * readsize
for i in delta]
pglengths = [i for i in glengths if i >=0]
if len(pglengths) > 1:
glength = sum(pglengths) / len(pglengths)
glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
else:
glength = sum(glengths) / len(glengths)
glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
attr['label']="%d : Forced [%d] %d -> %d" % (nid,z,s1,s2)
attr['length']= int(glength) if glength > 0 else 10
attr['first']=first
attr['last']=last
attr['weight']=0
attr['gappairs']=len(glengths)
attr['gaplength']=int(glength)
attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
attr['gapdeltas']=[frglen - i - readsize for i in delta]
attr['pairendlink']=z
attr['ingene']=0
attr['link']=(s1,s2)
attr['graphics']={'width':z // 10.,
'arrow':'last',
'fill' :color
}
attr['stemid']=nid
attr['sequence']=b"N" * attr['length']
attr['path']=[first] + [0] * (attr['length']-1) + [last]
attr['class']='scaffold:forced'
else:
attr['class']='internal'
__cacheAli = set()
__cacheAli2 = set()
[docs]def fillGaps(self,minlink=5,
back=200,
kmer=12,
smin=40,
delta=0,
cmincov=5,
minread=20,
minratio=0.1,
emincov=1,
maxlength=None,
gmincov=1,
minoverlap=60,
lowfilter=True,
adapters5=(),
adapters3=(),
maxjump=0,
snp=False,
nodeLimit=1000000,
onlyLinking=False,
useonce=True,
logger=None):
'''
:param minlink:
:param back:
:param kmer:
:param smin:
:param delta:
:param cmincov:
:param minread:
:param minratio:
:param emincov:
:param maxlength:
:param gmincov:
:param minoverlap:
:param lowfilter:
:param maxjump:
:param snp: If set to True (default value is False) erase SNP variation
by conserving the most abundant version
'''
global __cacheAli
global __cacheAli2
__cacheAli = __cacheAli2
__cacheAli2 = set()
def isInitial(n):
return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
def isTerminal(n):
return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
assgraph = self.compactAssembling(verbose=False)
index = self.index
#List of edges not connected at their beginning
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
#List of edges not connected at their end
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
#Corresponding id of these edges
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
#epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]
eei = [set(assgraph.getEdgeAttr(*i)['path'][0:100]) for i in ei]
eet = [set(assgraph.getEdgeAttr(*i)['path'][-100:]) for i in et]
print(eiid,file=sys.stderr) # <EC>
exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
nei = len(ei)
net = len(et)
s=[]
maxcycle=max(self.graph.getNodeAttr(i)['cycle'] for i in self.graph)
lassemb = len(self)
cycle = maxcycle
linked=set()
extended=set()
for e1 in range(net):
for e2 in range(nei):
connected = et[e1][1]==ei[e2][0]
if not connected:
linkedby,ml,sl,pdelta = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back) # @UnusedVariable
if linkedby >= minlink and abs(etid[e1]) <= abs(eiid[e2]):
extended.add(etid[e1])
extended.add(-eiid[e2])
if (etid[e1],eiid[e2]) not in linked:
linked.add((-eiid[e2],-etid[e1]))
print("\n\nLinking Stems %d -> %d" % (etid[e1],eiid[e2]),
file=sys.stderr)
# ex = frozenset(((ext[e1] | exi[e2]) - ept[e1] - epi[e2]) | eet[e1] | eei[e2])
ex = frozenset(ext[e1] | exi[e2] | eet[e1] | eei[e2])
ingraph = sum(i in self.graph for i in ex)
nreads = len(ex)
if ingraph < nreads:
logOrPrint(logger,
"--> %d | %d = %d reads to align (%d already assembled)" % (len(ext[e1]),len(exi[e2]),nreads,ingraph),
)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
#goodali = [i for i in ali if len(i) >= nreads/4]
goodali=ali
logOrPrint(logger,
"--> %d consensus to add" % len(goodali))
for a in goodali:
#print(b'\n'.join(alignment2bytes(a,index)).decode('ascii'))
cycle+=1
c = consensus(a,index,cmincov)
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print('',file=sys.stderr)
else:
logOrPrint(logger,
"--> already aligned")
if not onlyLinking:
for e1 in range(net):
if etid[e1] not in extended:
print("\n\nExtending Stems %d" % (etid[e1]),
file=sys.stderr)
ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
nreads = len(ex)
print("--> %d reads to align" % (nreads),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
#goodali = [i for i in ali if len(i) >= nreads/4]
goodali=ali
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
c = consensus(a,index,cmincov)
if c:
cycle+=1
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print("",file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
self.cleanDeadBranches(maxlength=10)
cutLowCoverage(self,gmincov,terminal=True)
# cutLowCoverage(self,int(gmincov/3),terminal=False)
if maxlength is not None:
smallbranches = maxlength
else:
smallbranches = estimateDeadBrancheLength(self)
print(" Dead branch length setup to : %d bp" % smallbranches,
file=sys.stderr)
self.cleanDeadBranches(maxlength=smallbranches)
if snp:
cutSNPs(self)
newnodes = len(self) - lassemb
print('',file=sys.stderr)
print("#######################################################",file=sys.stderr)
print("#",file=sys.stderr)
print("# Added : %d bp (total=%d bp)" % (newnodes/2,len(self)/2),file=sys.stderr)
print("#",file=sys.stderr)
print("#######################################################",file=sys.stderr)
print('',file=sys.stderr)
return newnodes
def insertFragment(self,seq,cycle=1,useonce=True):
index = self.index
rsize = index.getReadSize()
readmax=len(index)+1
seeds = set()
usedreads = getusedreads()
ireadidE=None
if len(seq) >= rsize:
graph = self.graph
probe = seq[0:rsize]
readid = index.getReadIds(probe)
for i in range(1,len(seq)-rsize+1):
coverage = readid[1]
ireadid = readid[0]
if not useonce or ireadid not in usedreads:
seeds.add(ireadid)
if ireadid not in graph:
node = graph.addNode(ireadid)
if 'cycle' not in node:
node['cycle']=cycle
if ireadid < readmax:
node['fake5']=0
node['fake3']=0
else:
node['graphics']={'fill':"#0000FF"}
probe = seq[i:i+rsize]
readidE = index.getReadIds(probe)
coverage = readidE[1]
ireadidE = readidE[0]
if ireadidE not in graph:
node = graph.addNode(ireadidE)
if 'cycle' not in node:
node['cycle']=cycle
if ireadidE < readmax:
node['fake5']=0
node['fake3']=0
else:
node['graphics']={'fill':"#0000FF"}
#if ireadid!=-ireadidE:
edges = graph.addEdge(ireadid,ireadidE)
edges[1]['coverage']=coverage
edges[2]['coverage']=coverage
edges[1]['label']="%s (%d)" % (edges[1]['ext'],coverage)
edges[2]['label']="%s (%d)" % (edges[2]['ext'],coverage)
#else:
# print >>sys.stderr,"\nWARNING : self loop on %d" % ireadidE
readid = readidE
if ireadidE is not None and (not useonce or ireadid not in usedreads):
seeds.add(ireadidE)
return [(-abs(i),("Consensus",),0) for i in seeds]
[docs]def getPairedRead(self,assgraph,stemid,back,end=True):
'''
:param assgraph:
:type assgraph:
:param stemid:
:type stemid:
:param back:
:type back:
:param end:
:type end:
'''
r = self.index
lr = len(r)
if not end:
stemid=-stemid
try:
stem = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e)
and assgraph.getEdgeAttr(*e)['stemid']==stemid))
except StopIteration:
print("ERROR in getPairedRead : requesting stemid=%s" % str(stemid),
file=stderr)
path=assgraph.getEdgeAttr(*stem)['path'][-back:]
reads=[set(r.normalizedPairedEndsReads(abs(i))[1]) for i in path if abs(i) < lr]
paired = set(reduce(lambda a,b: a | b,reads,set()))
if not end:
paired = set(-i for i in paired)
return paired
[docs]def pairEndedConnected(self,assgraph,edge1,edge2,back=250):
'''
Returns how many pair ended reads link two edges in a compact assembling graph
:param assgraph: The compact assembling graph as produced by the
:py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
:type assgraph: :py:class:`~orgasm.graph.DiGraphMultiEdge`
:param edge1: The ``stemid`` of the first edge
:type edge1: :py:class:`int`
:param edge2: The ``stemid`` of the second edge
:type edge2: :py:class:`int`
:param back: How many base pairs must be considered at the end of each edge
:type back: :py:class:`int`
:return: The count of pair ended reads linking both the edges
:rtype: :py:class:`int`
'''
ri = self.index
lri= len(ri)
s1 = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e)
and assgraph.getEdgeAttr(*e)['stemid']==edge1))
s2 = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e)
and assgraph.getEdgeAttr(*e)['stemid']==edge2))
e1 = assgraph.getEdgeAttr(*s1)['path'][-back:]
e2 = assgraph.getEdgeAttr(*s2)['path'][0:back]
de1 = {}
de2 = {}
j = 0
for i in e1:
de1[i] = j
j+=1
# for i in e1:
# l = de1.get(i,[])
# de1[i] = l
# l.append(j)
# j+=1
j = 0
for i in e2:
de2[i] = j
j+=1
pe1 = [k for k in
[(a,[j for j in b if j in de2])
for a,b in [ri.normalizedPairedEndsReads(i)
for i in e1 if i > 0 and i < lri]] if k[1]]
pe2 = [k for k in
[(a,[j for j in b if j in de1])
for a,b in [ri.normalizedPairedEndsReads(i)
for i in e2 if i < 0 and -i < lri]] if k[1]]
le1=len(e1)
peset = set()
delta = []
for f,rs in pe1:
for r in rs:
p = (f,abs(r)) if f < abs(r) else (abs(r),-f)
if p not in peset:
peset.add(p)
p1 = de1[f]
p2 = de2[r]
delta.append((le1 - p1) + 1 + p2)
for f,rs in pe2:
for r in rs:
p = (-f,abs(r)) if -f < abs(r) else (abs(r),f)
if p not in peset:
peset.add(p)
p1 = de1[r]
p2 = de2[f]
delta.append(le1 - p1 + 1 + p2)
if (len(delta) > 1):
dmean = sum(delta) / len(delta)
variance = sum((x -dmean)** 2 for x in delta) / (len(delta) - 1)
dsd = math.sqrt(variance)
dmean+=ri.getReadSize()
else:
dmean = None
dsd = None
return len(peset),dmean,dsd,delta
[docs]def coverageEstimate(self,matches=None,index=None,timeout=60.0):
'''
Estimates the average coverage depth of the sequence.
The algorithm is masic and can be very slow. To avoid infinity
computation time a timeout limits it to 60 secondes by default.
Three values are returned by the function :
- The number of bp considered to estimate the coverage
- The length of the segment used for the estimation
- The coverage depth
:param timeout: Maximum computation time.
:return: a triplet (int,int,float)
'''
def weight(a,b):
maxw=0
for e in cg.edgeIterator(edgePredicate = lambda i:i[0]==a and i[1]==b):
attr = cg.getEdgeAttr(*e)
w = attr['length'] * attr['weight']
if w > maxw:
maxw=w
return maxw
def coverage(start):
wp = cg.minpath(start, distance=weight,allowCycle=True)
# print ">>>",start,wp
maxpath = max(wp[0].values())
maxend=[i for i in wp[0] if wp[0][i]==maxpath][0]
i=maxend
path=[i]
if i == start and wp[0][i] > 0:
i=wp[1][i]
path.append(i)
while i!=start:
i=wp[1][i]
path.append(i)
return(maxpath,path)
cg = self.compactAssembling(verbose=False)
if matches is not None and index is not None:
genesincontig(cg, index, matches)
# A first and fast approach is to look for long sequence (> 1kb)
longweight = [(cg.getEdgeAttr(*i)['weight'],cg.getEdgeAttr(*i)['length'])
for i in cg.edgeIterator(edgePredicate=lambda e:'weight' in cg.getEdgeAttr(*e))
if cg.getEdgeAttr(*i)['length'] > 1000 and cg.getEdgeAttr(*i)['ingene']>0]
else:
# A first and fast approach is to look for long sequence (> 1kb)
longweight = [(cg.getEdgeAttr(*i)['weight'],cg.getEdgeAttr(*i)['length'])
for i in cg.edgeIterator(edgePredicate=lambda e:'weight' in cg.getEdgeAttr(*e))
if cg.getEdgeAttr(*i)['length'] > 1000]
if longweight:
maxpath = sum(i[1] for i in longweight)
cov = weightedMode((i[0],i[1]//100) for i in longweight)
return maxpath,maxpath,cov
# Seconde strategy : we look for the longest of the shortest path
maxpath=0
path=None
btime = time()
n=list(cg)
# n = [i for i in cg if len(list(cg.parentIterator(i)))==0]
# dest = [i for i in cg if len(list(cg.neighbourIterator(i)))==0]
e = [max([cg.getEdgeAttr(*j)['length']
for j in list(cg.edgeIterator(edgePredicate=lambda i: i[1]==k))]+[0])
for k in n]
ne = list(map(lambda a,b:(a,b),n,e))
ne.sort(key=lambda i:i[1],reverse=True)
n = [i[0] for i in ne]
for i in n:
w,p = coverage(i)
if w > maxpath:
maxpath=w
path=p
if (time()-btime) > timeout:
break
# print path
if path:
path.reverse()
j = path[0]
cumlength=0
for i in path[1:]:
maxw=0
maxlength=0
for e in cg.edgeIterator(edgePredicate = lambda k:k[0]==j and k[1]==i):
attr = cg.getEdgeAttr(*e)
w = attr['length'] * attr['weight']
# print w , attr['length'] , attr['weight']
if w > maxw:
maxw=w
maxlength=attr['length']
cumlength+=maxlength
j=i
else:
l = [(cg.getEdgeAttr(*e)['length'])
for e in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e))]
w = [(cg.getEdgeAttr(*e)['weight'])
for e in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e))]
if l:
maxpath=max(l)
coverage = w[l.index(maxpath)]
return maxpath,maxpath,coverage
else:
return None,None,None # <-- ???
return maxpath,cumlength,maxpath/float(cumlength)
[docs]def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200,logger=None,tags=[]):
'''
Convert a path in an compact assembling graph in a fasta formated sequences.
:param assgraph: The compact assembling graph as produced by the
:py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
:type assgraph: :py:class:`~orgasm.graph.DiGraphMultiEdge`
:param path: an ``iterable`` providing an ordered list of ``stemid`` indicating
the path to follow.
:type path: an ``iterable`` over :py:class:`int`
:param identifier: the identifier used in the header of the fasta formated sequence
:type identifier: :py:class:`bytes`
:param minlink: the minimum count of pair ended link to consider
for asserting the relationship
:type minlink: :py:class:`int`
:param nlength: how many ``N`` must be added between two segment of sequences only connected
by pair ended links
:type nlength: :py:class:`int`
:param back: How many base pairs must be considered at the end of each edge
:type back: :py:class:`int`
:returns: a string containing the fasta formated sequence
:rtype: :py:class:`bytes`
:raises: :py:class:`AssertionError`
'''
frglen,frglensd = estimateFragmentLength(self) # @UnusedVariable
#
# Build a dictionary with:
# - Keys = stemid
# - Values = edge descriptor (from,to,x)
#
alledges = dict((assgraph.getEdgeAttr(*e)['stemid'],e)
for e in assgraph.edgeIterator(edgePredicate = lambda i: 'stemid' in assgraph.getEdgeAttr(*i)))
coverage=[]
slength=[]
seq=[]
label=[]
oldstem=None
oldid=None
oldstemclass=None
rank = 1
forceconnection=False
for stemid in path:
if stemid != 0:
stem = alledges[stemid]
attr = assgraph.getEdgeAttr(*stem)
stemclass = attr['class']
sequence = attr['sequence']
if rank==1 and stemclass!="sequence":
raise RuntimeError("A path cannot start on a gap")
# Switch the stem to a dashed style
graphics = attr.get('graphics',{})
attr['graphics'] = graphics
graphics['style'] = 'dashed'
# Manage step rank information of each step
allsteps = attr.get('steps',{})
steps = allsteps.get(identifier,[])
steps.append(rank)
allsteps[identifier]=steps
attr['steps']=allsteps
if oldstem is not None:
connected,ml,sl,delta = pairEndedConnected(self,assgraph,oldid,stemid,back) # @UnusedVariable
if oldstem[1]==stem[0]:
if oldstemclass=="sequence":
if stemclass=="sequence": # Link between 2 sequences
if ml is not None:
logOrPrint(logger,
"Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" %
(oldid,stemid,connected,float(ml),float(sl)))
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
logOrPrint(logger,
"Both segments %d and %d are connected but covered by 0 paired-end" %
(oldid,stemid))
label.append('{connection: 0}')
elif stemclass[0:9]=="scaffold:": # Link a sequence and a gap
logOrPrint(logger,
"Both segments %d and %d are disconnected" % attr['link'])
if stemclass=="scaffold:paired-end":
logOrPrint(logger,
" But linked by %d pair ended links (gap length=%f sd=%f)" %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
label.append('{N-connection: %d - Gap length: %d, sd: %d}' %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
elif stemclass=="scaffold:forced":
logOrPrint(logger," Connection is forced")
if attr['pairendlink'] > 0:
logOrPrint(logger,
" But asserted by %d pair ended links (gap length=%f sd=%f)" %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
label.append('{F-connection: %d - Gap length: %d, sd: %d}' %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
else:
label.append('{F-connection: %d}' % connected)
elif stemclass=="scaffold:overlap":
logOrPrint(logger,
" But overlap by %dbp supported by %d pair ended links" %
(-attr['length'],
attr['pairendlink']))
label.append('{O-connection: %d - Overlap length: %d}' %
(attr['pairendlink'],
-attr['length']))
# Remove the overlap length on the last inserted sequence
seq[-1]=seq[-1][:attr['length']]
elif oldstemclass[0:9]=="scaffold:":
if stemclass=="sequence":
seq.append(self.index.getRead(attr['path'][0],
0,
self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
else:
raise RuntimeError('A scaffold link must be followed by a sequence %d --> %d' %
(oldid,stemid))
elif forceconnection:
logOrPrint(logger," Connection is forced")
if connected > 0:
glength = int(frglen-ml - self.index.getReadSize())
logOrPrint(logger,
" But asserted by %d pair ended links (gap length=%f sd=%f)" %
(connected,
glength,
sl))
label.append('{F-connection: %d - Gap length: %d, sd: %d}' %
(connected,
glength,
sl))
else:
logOrPrint(logger,
"Without any support from pair ended links")
label.append('{F-connection: %d}' % connected)
glength = nlength
seq.append(b'N'*int(glength))
slength.append(int(glength))
seq.append(self.index.getRead(attr['path'][0],
0,
self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
# Add the foced link to the compact assembly graph
flink = assgraph.addEdge(oldstem[1],stem[0])
rlink = assgraph.addEdge(-stem[0],-oldstem[1])
flink['label']="Forced (%d) %d -> %d" % (connected,oldid,stemid)
flink['graphics']={'width':1,
'arrow':'last',
'fill':'0xFF0000',
'style':'dashed'
}
rlink['label']="Forced (%d) %d -> %d" % (connected,-stemid,-oldid)
rlink['graphics']={'width':1,
'arrow':'last',
'fill':'0xFF0000',
'style':'dashed'
}
else:
raise AssertionError('Disconnected path between stem '
'%d and %d only %d pair ended links' % (oldid,stemid,connected))
if stemclass=="sequence":
if attr['length'] > 10:
attr['label']="%d : %s->(%d)->%s [%d] @ %s" % (stemid,sequence[0:5].decode('ascii'),
attr['length'],
sequence[-5:].decode('ascii'),
int(attr['weight']),
attr['steps'])
else:
attr['label']="%d : %s->(%d) [%d] @ %s" % (stemid,
sequence.decode('ascii'),
attr['length'],
int(attr['weight']),
attr['steps'])
label.append(attr['label'])
seq.append(sequence)
coverage.append(attr['weight'])
slength.append(attr['length'])
rank+=1
oldstem = stem
oldid=stemid
oldstemclass=stemclass
forceconnection=False
else:
forceconnection=True
s1 = alledges[path[-1]]
s2 = alledges[path[0]]
sid1=assgraph.getEdgeAttr(*s1)['stemid']
sid2=assgraph.getEdgeAttr(*s2)['stemid']
sclass2=assgraph.getEdgeAttr(*s2)['class']
connected,ml,sl,delta = pairEndedConnected(self, # @UnusedVariable
assgraph,
sid1,
sid2,
back)
if s1[1]==s2[0]:
if ml is not None:
logOrPrint(logger, "Path is circular and connected by %d (length: %d, sd: %d)" %
(connected,int(ml),int(sl))
)
else:
logOrPrint(logger, "Path is circular")
circular=True
if ml is not None:
if sclass2=="sequence":
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
if sclass2!="sequence":
raise RuntimeError("A path cannot ends on a gap")
if forceconnection:
logOrPrint(logger,"Circular connection forced",)
logOrPrint(logger,"Linked by %d pair ended links" % connected)
label.append('{F-connection: %d}' % connected)
seq.append(b'N'*int(nlength))
circular=True
else:
logOrPrint(logger,"Path is linear")
circular=False
seq.insert(0,self.index.getRead(s2[0],0,self.index.getReadSize()).lower())
slength.insert(0,self.index.getReadSize())
sequence = b''.join(seq)
length = sum(slength)
mcov = oneXcoverage(assgraph)
fasta=[">%s seq_length=%d; coverage=%5.1f; circular=%s; %s%s" % (identifier,length,
mcov,circular,
tags2str(tags) + " ",
'.'.join(label))]
fasta.extend(sequence[i:i+60].decode('ascii')
for i in range(0,len(sequence),60)
)
return "\n".join(fasta)
def pathConstraints(self,cg,threshold=5.,back=500, minlink=5):
def minisatDoubleConstraints(self,cg):
e = [i for i in cg.edgeIterator(edgePredicate=lambda j: len(list(cg.edgeIterator(edgePredicate=lambda k: k[0]==j[1] and k[1]==j[0] and 'stemid' in cg.getEdgeAttr(*k))))>0) if abs(i[0]) < abs(i[1])]
constraints = {}
for start,end,index in e: # @UnusedVariable
input1 = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==start and i[0]!=end and 'stemid' in cg.getEdgeAttr(*i)))
input2 = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==end and i[0]!=start and 'stemid' in cg.getEdgeAttr(*i)))
output1 = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==end and i[1]!=start and 'stemid' in cg.getEdgeAttr(*i)))
output2 = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==start and i[1]!=end and 'stemid' in cg.getEdgeAttr(*i)))
if input1 and output1:
middleF = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==start and j[1]==end and 'stemid' in cg.getEdgeAttr(*j)))
middleR = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==end and j[1]==start and 'stemid' in cg.getEdgeAttr(*j)))
inedge = input1
output= output1
else:
middleF = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==end and j[1]==start and 'stemid' in cg.getEdgeAttr(*j)))
middleR = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==start and j[1]==end and 'stemid' in cg.getEdgeAttr(*j)))
inedge = input2
output= output2
if len(inedge)==1 and len(middleF)==1 and len(middleR)==1 and len(output)==1:
inedge = inedge[0]
middleF = middleF[0]
middleR = middleR[0]
output = output[0]
inputcov = cg.getEdgeAttr(*inedge)['weight']
middleFcov = cg.getEdgeAttr(*middleF)['weight']
middleRcov = cg.getEdgeAttr(*middleR)['weight']
outputcov = cg.getEdgeAttr(*output)['weight']
r1 = round(middleFcov/float(inputcov))
r2 = round(middleRcov/float(inputcov)) + 1
r3 = round(middleFcov/float(outputcov))
r4 = round(middleRcov/float(outputcov)) + 1
r = int(round((r1+r2+r3+r4)/4)) - 1
constraints[cg.getEdgeAttr(*inedge)['stemid']] = [[cg.getEdgeAttr(*middleF)['stemid'],cg.getEdgeAttr(*middleR)['stemid']] * r + \
[cg.getEdgeAttr(*middleF)['stemid'],cg.getEdgeAttr(*output)['stemid']]]
return constraints
def minisatSimpleConstraints(self,cg):
e = [i for i in cg.edgeIterator(edgePredicate=lambda j: j[0]==j[1] and 'stemid' in cg.getEdgeAttr(*j))]
constraints = {}
for start,end,index in e:
inedge = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==start and i[0]!=end and 'stemid' in cg.getEdgeAttr(*i)))
output = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==end and i[1]!=start and 'stemid' not in cg.getEdgeAttr(*i)))
middle = (start,end,index)
if len(inedge)==1 and len(output)==1:
inedge = inedge[0]
output = output[0]
inputcov = cg.getEdgeAttr(*inedge)['weight']
middlecov = cg.getEdgeAttr(*middle)['weight']
outputcov = cg.getEdgeAttr(*output)['weight']
r1 = round(middlecov/float(inputcov))
r2 = round(middlecov/float(outputcov))
r = int(round((r1+r2)/2))
constraints[cg.getEdgeAttr(*inedge)['stemid']] = [[cg.getEdgeAttr(*middle)['stemid']] * r + \
[cg.getEdgeAttr(*output)['stemid']]]
return constraints
def pairEndedConstraints(asm,cg,threshold=5.,back=500):
fork = [e for e in cg.edgeIterator(edgePredicate=lambda i: \
'stemid' in cg.getEdgeAttr(*i) and
len(list(cg.parentIterator(i[0],
edgePredicate=lambda j: j[0]!=i[1] and 'stemid' in cg.getEdgeAttr(*j))
)
)==2
and len(list(cg.neighbourIterator(i[1],
edgePredicate=lambda j:j[1]!=i[0] and 'stemid' in cg.getEdgeAttr(*j))
)
)==2
)
]
bifork = [([cg.getEdgeAttr(*i)['stemid'] for i in
cg.edgeIterator(edgePredicate=lambda j: j[1]==e[0] and j[0]!=e[1] and 'stemid' in cg.getEdgeAttr(*j))],
cg.getEdgeAttr(*e)['stemid'],
[cg.getEdgeAttr(*i)['stemid'] for i in
cg.edgeIterator(edgePredicate=lambda j: j[0]==e[1] and j[1]!=e[0] and 'stemid' in cg.getEdgeAttr(*j))]) for e in fork]
links = [(i,
(pairEndedConnected(asm,cg,i[0][0],i[2][0],back)[0]+
pairEndedConnected(asm,cg,i[0][1],i[2][1],back)[0]+0.0001)/
(pairEndedConnected(asm,cg,i[0][0],i[2][1],back)[0]+
pairEndedConnected(asm,cg,i[0][1],i[2][0],back)[0]+0.0001))
for i in bifork]
constraints = {}
for (starts,middle,ends),ratio in links:
if ratio >= threshold:
constraints[starts[0]]=[middle,ends[0]]
constraints[starts[1]]=[middle,ends[1]]
elif ratio <= 1/threshold:
constraints[starts[0]]=[middle,ends[1]]
constraints[starts[1]]=[middle,ends[0]]
changed=True
while changed:
n = iter(constraints)
changed=False
while not changed:
try:
k=next(n)
nk=constraints[k][-1]
if nk in constraints:
constraints[k].extend(constraints[nk])
# del constraints[nk]
changed=True
except StopIteration:
break
for k in constraints:
constraints[k]=[constraints[k]]
return constraints
# def scaffoldConstraints(self,cg,back=500, minlink=5):
# ends = [cg.getEdgeAttr(*i)['stemid']
# for i in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e)
# and len(list(cg.neighbourIterator(e[1],
# edgePredicate=lambda i: 'stemid' in cg.getEdgeAttr(*i)
# )
# ))==0)]
# starts= [cg.getEdgeAttr(*i)['stemid']
# for i in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e) and len(list(cg.parentIterator(e[0], edgePredicate=lambda i: 'stemid' in cg.getEdgeAttr(*i))))==0)]
#
# constraints = {}
#
# for e in ends:
# for s in starts:
# if pairEndedConnected(self,cg,e,s,back)[0] >= minlink:
# if e in constraints:
# constraints[e].append([s])
# else:
# constraints[e]=[[s]]
# return constraints
def mergeConstraints(c1,c2):
for k in c2:
if k in c1:
c1[k].extend(c2[k])
else:
c1[k]=c2[k]
return c1
constraints = {}
constraints = mergeConstraints(constraints,minisatSimpleConstraints(self,cg))
constraints = mergeConstraints(constraints,minisatDoubleConstraints(self,cg))
constraints = mergeConstraints(constraints,pairEndedConstraints(self,cg,threshold,back))
# constraints = mergeConstraints(constraints,scaffoldConstraints(self,cg,back,minlink))
interns = set()
for e in constraints:
for p in constraints[e]:
for n in p[0:-1]:
interns.add(n)
for e in cg.edgeIterator(edgePredicate=lambda i: 'stemid' in cg.getEdgeAttr(*i)):
stemid = cg.getEdgeAttr(*e)['stemid']
if stemid not in constraints and stemid not in interns:
constraints[stemid]=[[cg.getEdgeAttr(*n)['stemid']]
for n in cg.edgeIterator(edgePredicate=lambda i : 'stemid' in cg.getEdgeAttr(*i) and i[0]==e[1])]
return constraints
def estimateDeadBrancheLength(self,default=10):
cg = self.compactAssembling(verbose=False)
smallbranches = [i for i in [(cg.getEdgeAttr(*e)['length'])
for e in cg.edgeIterator(edgePredicate=lambda e:
len(list(cg.neighbourIterator(e[1])))==0
or len(list(cg.parentIterator(e[0])))==0)]
if i > 1 and i < 100]
msmallbranches = mode(smallbranches)
# print msmallbranches, smallbranches
# looks for small branches on the side of the main assembly
if msmallbranches is not None and msmallbranches < 100:
msmallbranches=msmallbranches*2
else:
msmallbranches=default
return msmallbranches
def selectGoodComponent(cg):
def geneincc(cc):
return any(cg.getEdgeAttr(*e).get('ingene',0)>0 for e in cc)
cc = list(cg.connectedComponentIterator())
goodcc=[]
for cc1 in cc:
ccp = set(-i for i in cc1)
if ccp not in goodcc:
goodcc.append(cc1)
cclength = [[cg.getEdgeAttr(*i)['length']
for i in cg.edgeIterator(nodePredicate=lambda n: n in cc1)
if 'stemid' in cg.getEdgeAttr(*i)
] for cc1 in goodcc]
ccmlength=[(sum(i)/float(len(i)),len(i)) for i in cclength]
m = weightedMode(ccmlength)
if m < 1000:
gcc = [list(cg.edgeIterator(nodePredicate=lambda n:n in c,
edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e)))
for c,(mcc,lcc) in map(lambda a,b:(a,b),goodcc,ccmlength) # @UnusedVariable
if mcc > m * 2]
else:
gcc = [list(cg.edgeIterator(nodePredicate=lambda n:n in c,
edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e)))
for c,(mcc,lcc) in map(lambda a,b:(a,b),goodcc,ccmlength) ] # @UnusedVariable
gcc = [cc for cc in gcc if geneincc(cc)]
return gcc
def unfoldmarker(assgraph,seeds=None):
if seeds is None:
cc = list(assgraph.edgeIterator(edgePredicate = lambda e: 'stemid' in assgraph.getEdgeAttr(*e)))
else:
cc = [e for e in seeds if 'stemid' in assgraph.getEdgeAttr(*e)]
sources = [x[0] for x in cc if assgraph.getEdgeAttr(*x)['ingene']>0]
dests = [x[1] for x in cc if assgraph.getEdgeAttr(*x)['ingene']>0]
maxp=[]
maxpp=0
for s in sources:
for d in dests:
p = assgraph.minpath(s,[d])
i = d
ep = [i]
while i != s and i in p[1]:
i=p[1][i]
ep.append(i)
ep.reverse()
f = ep[0]
pp=[]
for e in ep[1:]:
es = list(assgraph.edgeIterator(edgePredicate=lambda i:i[0]==f and i[1]==e))
ea = [assgraph.getEdgeAttr(*i) for i in es]
ev = max([a['length'] * a['ingene'] for a in ea]+[0])
pp.append(ev)
f=e
pp = sum(pp)
if pp > maxpp or (pp==maxpp and len(ep) < len(maxp)):
maxp=ep
maxpp=pp
cont = True
while cont:
p = set(assgraph.parentIterator(maxp[0]))
if len(p)==1:
sid = p.pop()
if sid not in maxp:
maxp.insert(0,sid)
else:
cont=False
else:
cont=False
cont = True
while cont:
p = set(assgraph.neighbourIterator(maxp[-1]))
if len(p)==1:
sid = p.pop()
if sid not in maxp:
maxp.append(sid)
else:
cont=False
else:
cont=False
f = maxp[0]
ep=[]
for e in maxp[1:]:
es = list(assgraph.edgeIterator(edgePredicate=lambda i:i[0]==f and i[1]==e))
ea = [assgraph.getEdgeAttr(*i) for i in es]
ge = es[0]
a = assgraph.getEdgeAttr(*ge)
if 'stemid' in a:
gw = a['weight']*a['length']
for i in es[1:]:
a = assgraph.getEdgeAttr(*i)
if 'stemid' in a:
w = a['weight']*a['length']
if w > gw:
gw=w
ge=i
sid = assgraph.getEdgeAttr(*ge)['stemid']
ep.append(sid)
f=e
return ep
def oneXcoverage(assgraph):
readSize = assgraph.assembler.index.getReadSize()
icov = []
for e in assgraph.edgeIterator():
attr = assgraph.getEdgeAttr(*e)
if attr['class'] == "sequence" and attr['length'] > readSize:
weight = attr['weight']
length = attr['length']
icov.append((weight,length))
return weightedMode(icov)
[docs]def unfoldAssembling(self,assgraph,constraints=None,
seeds=None,threshold=5.,back=500,
minlink=5, limitSize=0,
circular=False,
force=False,
cov1x=None,
logger=None):
'''
:param assgraph:
:param constraints:
:param seeds: set of stem to use as seed for the unfolding algorithm
:param threshold:
:param back:
:param minlink:
:param limitSize: maximum size of the contig in base pair
:param circular: if TRUE, we hope to get a circular contig
:param force: if TRUE, we ask for a circular contig
:param logger:
'''
def addStepToAPath(path,step,limitSize):
nw = dict(path[0])
np = list(path[2])
iw = path[3]
nl = path[4]
np.append(step)
if abs(step) in nw:
nw[abs(step)]-=iw
ns=sum((nw[x]* length[x])**2 for x in nw)
nl+=length[abs(step)]
if ns > path[1] or (limitSize > 0 and nl > limitSize):
rep = path
complete=True
else:
rep = [nw,ns,np,iw,nl]
complete=False
return complete,rep
allattrs = [assgraph.getEdgeAttr(*i)
for i in assgraph.edgeIterator()]
attrs = dict((abs(i['stemid']),(i['length'],i['weight']))
for i in allattrs
if i['class']=="sequence"
)
allattrs = dict((abs(i['stemid']),(i['length'],i['weight']))
for i in allattrs
if 'stemid' in i
)
#si : stem id
if seeds is None:
si = set(attrs.keys())
else:
si = set(assgraph.getEdgeAttr(*e)['stemid']
for e in seeds
if assgraph.getEdgeAttr(*e)['class']=="sequence")
if constraints is None:
constraints = pathConstraints(self,
assgraph,
threshold=threshold,
back=back,
minlink=minlink)
stems = dict((assgraph.getEdgeAttr(*i)['stemid'],i)
for i in assgraph.edgeIterator(edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e)))
weight=dict((abs(k),allattrs[abs(k)][1]) for k in attrs)
length=dict((abs(k),allattrs[abs(k)][0]) for k in stems)
# paths if the collection of on construction path
# each path is constituted of five elements :
# - [0] the weigth directory
# - [1] the score of the path which have to be minimized
# - [2] the actual path as a sequence of stem ids
# - [3] the weight of the first stem in the path
# - [4] the total length of the path
if cov1x is None:
covone=oneXcoverage(assgraph)
logOrPrint(logger,"Coverage 1x estimated = %d" % covone)
else:
covone=cov1x
logOrPrint(logger,"Coverage set by user 1x = %d" % covone)
# paths = [[dict(weight),0,(i,),0,0] for i in si]
paths = [[dict(weight),0,(i,),covone,0] for i in si]
for p in paths:
# p[3] = p[0][abs(p[2][0])] # <--- Key error bug
p[0][abs(p[2][0])]=0
p[1]=sum((p[0][x]*length[x])**2 for x in p[0])
p[4]=length[abs(p[2][0])]
completePath=[]
while paths:
# I try to extend paths with constrains
newpath=[]
for p in paths:
last = p[2][-1]
complete=False
if last in constraints:
if len(constraints[last])==0:
completePath.append(p)
for steps in constraints[last]:
lp = p
for step in steps:
complete,lp=addStepToAPath(lp,step,limitSize)
if complete:
completePath.append(lp)
break
if not complete:
newpath.append(lp)
paths = newpath
newpath=[]
cp={}
for p in completePath:
first = stems[p[2][0]][0]
end = stems[p[2][-1]][1]
closed = first==end
if not closed:
if circular:
# We try to close the graph
c = assgraph.minpath(end,
[first],
edgePredicate=lambda e: assgraph.getEdgeAttr(*e)['class']=="sequence"
)
i = first
ep = [i]
while i != end and i in c[1]:
i=c[1][i]
ep.append(i)
np = list(p[2])
if circular:
for x in range(len(ep)-1,0,-1):
s = assgraph.getEdgeAttr(ep[x],ep[x-1],0)['stemid']
p[0][abs(s)]-=p[3]
np.append(s)
ns = sum((p[0][x]* length[x])**2 for x in p[0])
first = stems[np[0]][0]
end = stems[np[-1]][1]
closed = first==end
if not closed and force:
closed = True
np.append(0)
np = tuple(np)
else:
ns = p[1]
np = p[2]
cp[tuple(np)]=(p[0],ns,p[3],closed)
cp = [(x,cp[x][1],cp[x][-1]) for x in cp]
cp.sort(key=lambda x:x[1],reverse=True)
return cp
def estimateFragmentLength(self,minlength=1000):
cg = self.compactAssembling(verbose=False)
edges = [i for i in cg.edgeIterator() if cg.getEdgeAttr(*i)['length']>=minlength]
lindex=len(self.index)
r=self.index
delta=[]
for e in edges:
stem=cg.getEdgeAttr(*e)['path']
si = dict((stem[i],i) for i in range(len(stem)))
for p in stem:
if p > 0 and p < lindex:
delta.extend(si[i]-si[p] for i in r.normalizedPairedEndsReads(p)[1]
if i in si and si[i] > si[p])
if len(delta) > 1:
mdelta = sum(delta) / float(len(delta))
variance= sum((d - mdelta)**2 for d in delta) / (len(delta)-1)
sdelta = math.sqrt(variance)
return (mdelta+self.index.getReadSize(),sdelta)
else:
return None,None
def dumpGraph(fileout,asm):
graph = asm.graph
index = asm.index
readsize = index.getReadSize()
edgeIterator = graph.edgeIterator()
if isinstance(fileout, (str,bytes)):
fileout = open(fileout,"w")
for i in range(index.fakes.firstid,index.fakes.lastid):
print('F %d %s' % (i,index.getRead(i,0,readsize).decode('ascii')),
file=fileout
)
for f,t,l in edgeIterator: # @UnusedVariable
if abs(f) < abs(t):
ea = asm.graph.getEdgeAttr(f,t,0)
print("E",f,t,ea['coverage'],file=fileout)
def restoreGraph(filein,index,matches=None):
asm = Assembler(index)
readmax = len(index) + 1
graph = asm.graph
gene = matchtogene(matches)
if isinstance(filein, str):
filein = open(filein)
for l in filein:
#print(len(graph))
if l[0]=="F":
t,i,s = l.strip().split()
i=int(i)
s=bytes(s,encoding='ascii')
rids = index.getReadIds(s)
assert rids[0]==i
elif l[0]=='E':
t,f,t,c = l.strip().split()
f = int(f)
t = int(t)
c = int(c)
if f not in graph:
node = graph.addNode(f)
node['gene']=gene.get(abs(f),None)
node['cycle']=0
if f < readmax:
node['fake5']=0
node['fake3']=0
else:
node['graphics']={'fill':"#00FF00"}
if t not in graph:
node = graph.addNode(t)
node['gene']=gene.get(abs(t),None)
node['cycle']=0
if f < readmax:
node['fake5']=0
node['fake3']=0
else:
node['graphics']={'fill':"#00FF00"}
edges = graph.addEdge(f,t)
edges[1]['coverage']=c
edges[2]['coverage']=c
edges[1]['label']="%s (%d)" % (edges[1]['ext'],c)
edges[2]['label']="%s (%d)" % (edges[2]['ext'],c)
return asm
# Experimental section, unused functions
[docs]def fillGaps2(self,minlink=5,
back=200,
kmer=12,
smin=40,
delta=0,
cmincov=5,
minread=20,
minratio=0.1,
emincov=1,
maxlength=None,
gmincov=1,
minoverlap=60,
lowfilter=True,
adapters5=(),
adapters3=(),
maxjump=0,
snp=False,
nodeLimit=1000000,
onlyfill=False):
'''
:param minlink:
:param back:
:param kmer:
:param smin:
:param delta:
:param cmincov:
:param minread:
:param minratio:
:param emincov:
:param maxlength:
:param gmincov:
:param minoverlap:
:param lowfilter:
:param maxjump:
:param snp: If set to True (default value is False) erase SNP variation
by conserving the most abundant version
'''
global __cacheAli
global __cacheAli2
__cacheAli = __cacheAli2
__cacheAli2 = set()
def isInitial(n):
return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
def isTerminal(n):
return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
assgraph = self.compactAssembling(verbose=False)
index = self.index
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]
eei = [set(assgraph.getEdgeAttr(*i)['path'][0:100]) for i in ei]
eet = [set(assgraph.getEdgeAttr(*i)['path'][-100:]) for i in et]
exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
nei = len(ei)
net = len(et)
s=[]
maxcycle=max(self.graph.getNodeAttr(i)['cycle'] for i in self.graph)
lassemb = len(self)
cycle = maxcycle
linked=set()
extended=set()
for e1 in range(net):
for e2 in range(nei):
connected = et[e1][1]==ei[e2][0]
if not connected:
linkedby,ml,sl,pdelta = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back) # @UnusedVariable
if linkedby >= minlink and abs(etid[e1]) <= abs(eiid[e2]):
extended.add(etid[e1])
extended.add(-eiid[e2])
if (etid[e1],eiid[e2]) not in linked:
linked.add((-eiid[e2],-etid[e1]))
print("\n\nLinking Stems %d -> %d" % (etid[e1],eiid[e2]),
file=sys.stderr)
ex = frozenset(((ext[e1] | exi[e2]) - ept[e1] - epi[e2]) | eet[e1] | eei[e2])
ingraph = sum(i in self.graph for i in ex)
nreads = len(ex)
if ingraph < nreads:
print ("--> %d | %d = %d reads to align (%d already assembled)" % (len(ext[e1]),len(exi[e2]),nreads,ingraph),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
goodali = [i for i in ali if len(i) >= nreads/4]
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
cycle+=1
c = consensus(a,index,cmincov)
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print('',file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
if (not onlyfill):
for e1 in range(net):
if etid[e1] not in extended:
print("\n\nExtending Stems %d" % (etid[e1]),
file=sys.stderr)
ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
nreads = len(ex)
print("--> %d reads to align" % (nreads),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
goodali = [i for i in ali if len(i) >= nreads/4]
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
c = consensus(a,index,cmincov)
if c:
cycle+=1
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print("",file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
self.cleanDeadBranches(maxlength=10)
cutLowCoverage(self,gmincov,terminal=True)
# cutLowCoverage(self,int(gmincov/3),terminal=False)
if maxlength is not None:
smallbranches = maxlength
else:
smallbranches = estimateDeadBrancheLength(self)
print(" Dead branch length setup to : %d bp" % smallbranches,
file=sys.stderr)
self.cleanDeadBranches(maxlength=smallbranches)
if snp:
cutSNPs(self)
newnodes = len(self) - lassemb
print('',file=sys.stderr)
print("#######################################################",file=sys.stderr)
print("#",file=sys.stderr)
print("# Added : %d bp (total=%d bp)" % (newnodes/2,len(self)/2),file=sys.stderr)
print("#",file=sys.stderr)
print("#######################################################",file=sys.stderr)
print('',file=sys.stderr)
return newnodes
def cutHighCoverage(self,maxcov,terminal=True):
def isTerminal(g,n):
return len(list(g.parentIterator(n)))==0 or len(list(g.neighbourIterator(n)))==0
def endnodeset(g):
return set(g.nodeIterator(predicate = lambda n : (len(list(g.neighbourIterator(n)))==0)))
def startnodeset(g):
return set(g.nodeIterator(predicate = lambda n : (len(list(g.parentIterator(n)))==0)))
if terminal:
tstates=[True]
else:
tstates=[True,False]
ilength=len(self)
cg = self.compactAssembling(verbose=False)
index = self.index
readSize = index.getReadSize()
for terminal in tstates:
print('',file=sys.stderr)
if terminal:
print("Deleting terminal branches",file=sys.stderr)
else:
print("Deleting internal branches",file=sys.stderr)
extremities = endnodeset(cg) | startnodeset(cg)
go = True
while go:
go = False
stems = [x for x in cg.edgeIterator()
if not terminal or (isTerminal(cg, x[0]) or isTerminal(cg, x[1]))]
if stems:
stems.sort(key=lambda i:cg.getEdgeAttr(*i)['weight'])
heaviest = stems.pop()
lattr = cg.getEdgeAttr(*heaviest)
# print >>sys.stderr
# print >>sys.stderr,lattr['stemid'],len(stems),lightest[0],lightest[1],lattr['weight']
# print >>sys.stderr,lattr['weight'] < mincov
# print >>sys.stderr,isTerminal(cg, lightest[0]), isTerminal(cg, lightest[1])
# print >>sys.stderr,lattr['weight'] < mincov and ((not terminal) or isTerminal(cg, lightest[0]) or isTerminal(cg, lightest[1]))
# print >>sys.stderr
if lattr['weight'] > maxcov:
if stems:
go=True
for n in lattr['path'][1:-1]:
if n in self.graph:
del self.graph[n]
if heaviest[0] in extremities and heaviest[0] in self.graph:
del self.graph[heaviest[0]]
if heaviest[1] in extremities and heaviest[1] in self.graph:
del self.graph[heaviest[1]]
print("Remaining edges : %d node : %d" % (self.graph.edgeCount(),len(self)),
end='\r',
file=sys.stderr)
cg.deleteEdge(*heaviest)
tojoin=[]
if heaviest[0] in extremities:
del cg[heaviest[0]]
else:
tojoin.append(heaviest[0])
if heaviest[1] in extremities:
del cg[heaviest[1]]
else:
tojoin.append(heaviest[1])
# print >>sys.stderr,lightest[0] in extremities,lightest[1] in extremities,tojoin
for c in tojoin:
begin = list(cg.parentIterator(c))
end = list(cg.neighbourIterator(c))
if len(begin)==1 and len(end)==1:
begin = begin[0]
end = end[0]
e1s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==begin and e[1]==c))
e2s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==c and e[1]==end))
if len(e1s)==1 and len(e2s)==1:
e1 = e1s[0]
e2 = e2s[0]
attr1 = cg.getEdgeAttr(*e1)
attr2 = cg.getEdgeAttr(*e2)
lentot= attr1['length'] + attr2['length']
weight= float(attr1['weight'] * attr1['length'] + attr2['weight'] * attr2['length'])/lentot
# print >>sys.stderr
# print >>sys.stderr,"(",attr1['weight'],"*", attr1['length'], '+',attr2['weight'],"*",attr2['length'],")/",lentot,"=",weight
# time.sleep(1)
sequence=attr1['sequence'] + attr2['sequence']
stemid=lattr['stemid']
path=attr1['path'] + attr2['path'][1:]
graphics={'width':int(weight/readSize)+1,
'arrow':'last'}
if lentot > 10:
label="%d : %s->(%d)->%s [%d]" % (stemid,
sequence[0:5].decode('ascii'),
lentot,
sequence[-5:].decode('ascii'),
weight)
else:
label="%d : %s->(%d) [%d]" % (stemid,
sequence.decode('ascii'),
lentot,
weight)
# print >>sys.stderr,"Adding merged edge -->",stemid,begin,end,weight
attr = cg.addEdge(begin,end)
attr['sequence']=sequence
attr['stemid']=stemid
attr['path']=path
attr['length']=lentot
attr['weight']= weight
attr['graphics']=graphics
attr['label']=label
cg.deleteEdge(*e2)
cg.deleteEdge(*e1)
del cg[c]
print('',file=sys.stderr)
return ilength - len(self)
def stem2fasta(self):
assgraph = self.compactAssembling(verbose=False)
si = assgraph.edgeIterator(edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))
seqs = []
for s in si:
sattr = assgraph.getEdgeAttr(*s)
seq = sattr["sequence"]
seq = b'\n'.join([seq[i:(i+60)] for i in range(0,len(seq),60)])
stemid = sattr['stemid']
if stemid < 0:
identifier = "edge_%d_comp" % -stemid
else:
identifier = "edge_%d" % stemid
title = ">%s stemid=%d; seq_length=%d; coverage=%d; %s\n" % (identifier,stemid,sattr['length'],sattr['weight'],sattr['label'])
seqs.append(title + seq.decode('ascii'))
return '\n'.join(seqs)
def parseFocedScaffold(parametters):
f = [x.split(":") for x in parametters]
f = set((int(x[0]),int(x[1])) for x in f if len(x)==2)
r = set((-x[1],-x[0]) for x in f)
return f | r