I'm trying to write a function (in Python) how many different K-grams a string has. How can I compute it using a suffix array and a LCP array (I already have them computed)?
I'm trying to write a function (in Python) how many different K-grams a string has. How can I compute it using a suffix array and a LCP array (I already have them computed)?
Having LCP array lc, we should walk through it and count values less than k, whilst corresponding suffix have enough length (at least k, this checking requires suffix array sa value):
from collections import defaultdict
def sort_bucket(s, bucket, order):
d = defaultdict(list)
for i in bucket:
key = s[i + order // 2:i + order]
d[key].append(i)
result = []
for k, v in sorted(d.items()):
if len(v) > 1:
result += sort_bucket(s, v, 2 * order)
else:
result.append(v[0])
return result
def suffix_array_ManberMyers(s):
return sort_bucket(s, range(len(s)), 1)
def lcp_kasai(s, suffarr):
n = len(suffarr)
k = 0
lcp = [0] * n
rank = [0] * n
for i in range(n):
rank[suffarr[i]] = i
for i in range(n):
if (k>0):
k -= 1
if(rank[i]==n-1):
k = 0
continue
j = sa[rank[i]+1]
while((i+k<n) & (j+k<n) & (s[i+k]==s[j+k])):
k += 1
lcp[rank[i]] = k
return lcp
def dumb(s, k):
kg = set()
for i in range(len(s)-k+1):
kg.add(s[i:i+k])
#print(sorted(kg))
return len(kg)
s = "ACGTTGCATGTCGCATGATGCATGAGAGCT$"
sa = suffix_array_ManberMyers(s)
print(sa)
lc = lcp_kasai(s, sa)
print(lc)
k = 3
distinct = 0
for i in range(1, len(lc)):
if (lc[i] < k) and (sa[i] + k < len(s)):
distinct += 1
print(dumb(s[:-1], k), distinct)
Output:
[30, 0, 24, 26, 21, 14, 17, 7, 20, 13, 6, 11, 1, 28, 23, 25, 16, 19, 12, 5, 27, 9, 2, 29, 10, 22, 15, 18, 4, 8, 3]
[0, 1, 2, 1, 4, 3, 3, 0, 5, 4, 1, 2, 1, 0, 3, 2, 1, 6, 5, 2, 1, 2, 0, 1, 1, 3, 2, 6, 2, 1, 0]
18 18
dumb function is for testing
