Last active
December 27, 2018 12:07
-
-
Save jacky860226/1d33adad858eef71bfe18120d8d69e6d to your computer and use it in GitHub Desktop.
Suffix Array + Longest Common Prefix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define F(x) (x)/3+((x)%3==1?0:tb) | |
#define G(x) (x)<tb?(x)*3+1:((x)-tb)*3+2 | |
int wa[3*maxn+5],wb[3*maxn+5],wv[3*maxn+5],c[maxn+5]; | |
inline bool c0(int *s,int a,int b){ | |
return s[a]==s[b]&&s[a+1]==s[b+1]&&s[a+2]==s[b+2]; | |
} | |
inline bool c12(int k,int *s,int a,int b){ | |
if(k==2)return s[a]<s[b]||s[a]==s[b]&&c12(1,s,a+1,b+1); | |
else return s[a]<s[b]||s[a]==s[b]&&wv[a+1]<wv[b+1]; | |
} | |
inline void radix_sort(int *s,int *a,int *b,int len,int A){ | |
static int i; | |
for(i=0;i<len;++i)wv[i]=s[a[i]]; | |
for(i=0;i<A;++i)c[i]=0; | |
for(i=0;i<len;++i)++c[wv[i]]; | |
for(i=1;i<A;++i)c[i]+=c[i-1]; | |
for(i=len-1;i>=0;--i)b[--c[wv[i]]]=a[i]; | |
} | |
void dc3(int *s,int *sa,int len,int A){ | |
int i,j,*san=sa+len,ta=0,tb=(len+1)/3,tbc=0,p; | |
int *rn=s+len; | |
s[len]=s[len+1]=0; | |
for(i=0;i<len;++i){ | |
if(i%3!=0)wa[tbc++]=i; | |
} | |
radix_sort(s+2,wa,wb,tbc,A); | |
radix_sort(s+1,wb,wa,tbc,A); | |
radix_sort(s,wa,wb,tbc,A); | |
for(p=1,rn[F(wb[0])]=0,i=1;i<tbc;++i){ | |
rn[F(wb[i])]=c0(s,wb[i-1],wb[i])?p-1:p++; | |
} | |
if(p<tbc)dc3(rn,san,tbc,p); | |
else for(i=0;i<tbc;i++)san[rn[i]]=i; | |
for(i=0;i<tbc;++i){ | |
if(san[i]<tb)wb[ta++]=san[i]*3; | |
} | |
if(len%3==1)wb[ta++]=len-1; | |
radix_sort(s,wb,wa,ta,A); | |
for(i=0;i<tbc;++i)wv[wb[i]=G(san[i])]=i; | |
for(i=0,j=0,p=0;i<ta&&j<tbc;++p){ | |
sa[p]=c12(wb[j]%3,s,wa[i],wb[j])?wa[i++]:wb[j++]; | |
} | |
for(;i<ta;++p)sa[p]=wa[i++]; | |
for(;j<tbc;++p)sa[p]=wb[j++]; | |
} | |
#undef F | |
#undef G | |
/* | |
DC3的輸入陣列s必須能儲存0~len的數值,所以設為int | |
s及sa陣列的大小必須大於3*maxn | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<algorithm> | |
#define radix_sort(x,y){\ | |
for(i=0;i<A;++i)c[i]=0;\ | |
for(i=0;i<len;++i)c[x[y[i]]]++;\ | |
for(i=1;i<A;++i)c[i]+=c[i-1];\ | |
for(i=len-1;i>=0;--i)sa[--c[x[y[i]]]]=y[i];\ | |
} | |
#define sac(r,a,b) r[a]!=r[b]||a+k>=len||r[a+k]!=r[b+k] | |
inline void suffix_array(const char *s,int len,int *sa,int *rank,int *tmp,int *c){ | |
int A='z'+1,i,k,id=0; | |
for(i=0;i<len;++i)rank[tmp[i]=i]=s[i]; | |
radix_sort(rank,tmp); | |
for(k=1;id<len-1;k<<=1){ | |
for(id=0,i=len-k;i<len;++i)tmp[id++]=i; | |
for(i=0;i<len;++i)if(sa[i]>=k)tmp[id++]=sa[i]-k; | |
radix_sort(rank,tmp); | |
std::swap(rank,tmp); | |
for(rank[sa[0]]=id=0,i=1;i<len;++i) | |
rank[sa[i]]=id+=sac(tmp,sa[i-1],sa[i]); | |
A=id+1; | |
} | |
} | |
#undef radix_sort | |
#undef sac |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<algorithm> | |
struct CMP{ | |
int len,k,*rank,a,b; | |
inline bool operator()(int i,int j){ | |
if(rank[i]!=rank[j])return rank[i]<rank[j]; | |
a=(i+=k)<len?rank[i]:-1; | |
b=(j+=k)<len?rank[j]:-1; | |
return a<b; | |
} | |
}; | |
inline void suffix_array(const char *s,int len,int *sa,int *rank,int *tmp){ | |
for(int i=0;i<len;++i){ | |
sa[i]=i; | |
rank[i]=s[i]; | |
} | |
CMP cmp={len,1}; | |
for(;;cmp.k<<=1){ | |
cmp.rank=rank; | |
std::sort(sa,sa+len,cmp); | |
tmp[sa[0]]=0; | |
for(int i=1;i<len;++i)tmp[sa[i]]=tmp[sa[i-1]]+cmp(sa[i-1],sa[i]); | |
if(tmp[sa[len-1]]==len-1)break; | |
std::swap(rank,tmp); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
h:高度數組 | |
sa:後綴數組 | |
rank:排名 | |
*/ | |
inline void suffix_array_lcp(const char *s,int len,int *h,int *sa,int *rank){ | |
for(int i=0;i<len;++i)rank[sa[i]]=i; | |
for(int i=0,k=0;i<len;++i){ | |
if(rank[i]==0)continue; | |
if(k)--k; | |
while(s[i+k]==s[sa[rank[i]-1]+k])++k; | |
h[rank[i]]=k; | |
} | |
h[0]=0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<string.h> | |
#define MAGIC(XD){\ | |
memset(sa,0,sizeof(int)*n);\ | |
memcpy(x,c,sizeof(int)*z);\ | |
XD\ | |
memcpy(x+1,c,sizeof(int)*(z-1));\ | |
for(i=0;i<n;++i){\ | |
if(sa[i]&&!t[sa[i]-1])sa[x[s[sa[i]-1]]++]=sa[i]-1;\ | |
}\ | |
memcpy(x,c,sizeof(int)*z);\ | |
for(i=n-1;i>=0;--i){\ | |
if(sa[i]&&t[sa[i]-1])sa[--x[s[sa[i]-1]]]=sa[i]-1;\ | |
}\ | |
} | |
void sais(int *s,int *sa,int *p,bool *t,int *c,int n,int z){ | |
bool neq=t[n-1]=1; | |
int nn=0,nmxz=-1,*q=sa+n,*ns=s+n,*x=c+z,lst=-1,i,j; | |
memset(c,0,sizeof(int)*z); | |
for(i=0;i<n;++i)++c[s[i]]; | |
for(i=0;i<z-1;++i)c[i+1]+=c[i]; | |
for(i=n-2;i>=0;i--)t[i]=(s[i]==s[i+1]?t[i+1]:s[i]<s[i+1]); | |
MAGIC( | |
for(i=1;i<n;++i){ | |
if(t[i]&&!t[i-1])sa[--x[s[i]]]=p[q[i]=nn++]=i; | |
} | |
); | |
for(i=0;i<n;++i)if((j=sa[i])&&t[j]&&!t[j-1]){ | |
neq=lst<0||memcmp(s+j,s+lst,(p[q[j]+1]-j)*sizeof(int)); | |
ns[q[lst=j]]=nmxz+=neq; | |
} | |
if(nmxz==nn-1)for(i=0;i<nn;++i)q[ns[i]]=i; | |
else sais(ns,q,p+nn,t+n,c+z,nn,nmxz+1); | |
MAGIC( | |
for(i=nn-1;i>=0;--i)sa[--x[s[p[q[i]]]]]=p[q[i]]; | |
); | |
} | |
#undef MAGIC | |
/*****************這些是需要用到的陣列大小**************/ | |
static const int MXN=10000000; | |
int s[MXN*2+5],sa[MXN*2+5],p[MXN+5],c[MXN*2+5]; | |
bool t[MXN*2+5]; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*SA-IS Algorithm*/ | |
const unsigned char mask[]={0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01}; | |
#define tget(i) ( (t[(i)/8]&mask[(i)%8])?1:0) | |
#define tset(i, b) t[(i)/8]=(b)?(mask[(i)%8]|t[(i)/8]):((~mask[(i)%8])&t[(i)/8]) | |
#define chr(i) (cs==sizeof(int)?((int*)s)[i]:((unsigned char *)s)[i]) | |
#define isLMS(i) (i>0&&tget(i)&&!tget(i-1)) | |
// find the start or end of each bucket | |
inline void getBuckets(unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
int i,sum=0; | |
// clear all buckets | |
for(i=0;i<=K;++i)bkt[i]=0; | |
// compute the size of each bucket | |
for(i=0;i<n;++i)++bkt[chr(i)]; | |
for(i=0;i<=K;++i){ | |
sum+=bkt[i]; | |
bkt[i]=end?sum:sum-bkt[i]; | |
} | |
} | |
// compute SAl | |
inline void induceSAl(unsigned char *t,int *SA,unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
int i,j; | |
// find starts of buckets | |
getBuckets(s,bkt,n,K,cs,end); | |
for(i=0;i<n;++i){ | |
j=SA[i]-1; | |
if(j>=0&&!tget(j))SA[bkt[chr(j)]++]=j; | |
} | |
} | |
// compute SAs | |
inline void induceSAs(unsigned char *t,int *SA,unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
int i,j; | |
// find ends of buckets | |
getBuckets(s,bkt,n,K,cs,end); | |
for(i=n-1;i>=0;--i){ | |
j=SA[i]-1; | |
if(j>=0&&tget(j))SA[--bkt[chr(j)]]=j; | |
} | |
} | |
// find the suffix array SA of s[0..n-1] in {1..K}?n | |
// require s[n-1]=0 (the sentinel!), n>=2 | |
// use a working space (excluding s and SA) of | |
// at most 2.25n+O(1) for a constant alphabet | |
void SA_IS(unsigned char *s,int *SA,int n,int K,short cs){ | |
// LS-type array in bits | |
unsigned char *t=(unsigned char *)malloc(n/8+1); | |
int i,j; | |
// classify the type of each character | |
// the sentinel must be in s1, important!!! | |
tset(n-2,0);tset(n-1,1); | |
for(i=n-3;i>=0;--i) | |
tset(i,(chr(i)<chr(i+1)||(chr(i)==chr(i+1)&&tget(i+1)==1))?1:0); | |
// stage 1: reduce the problem by at least 1/2 | |
// sort all the S-substrings | |
// bucket array | |
int *bkt=(int *)malloc(sizeof(int)*(K+1)); | |
// find ends of buckets | |
getBuckets(s,bkt,n,K,cs,true); | |
for(i=0;i<n;++i)SA[i]=-1; | |
for(i=1;i<n;++i)if(isLMS(i))SA[--bkt[chr(i)]]=i; | |
induceSAl(t,SA,s,bkt,n,K,cs,false); | |
induceSAs(t,SA,s,bkt,n,K,cs,true); | |
free(bkt); | |
// compact all the sorted substrings into | |
// the first n1 items of SA | |
// 2*n1 must be not larger than n (proveable) | |
int n1=0; | |
for(i=0;i<n;++i)if(isLMS(SA[i]))SA[n1++]=SA[i]; | |
// find the lexicographic names of substrings | |
// init the name array buffer | |
for(i=n1;i<n;++i)SA[i]=-1; | |
int name=0,prev=-1; | |
for(i=0;i<n1;++i){ | |
int pos=SA[i];bool diff=false; | |
for(int d=0;d<n;++d) | |
if(prev==-1||chr(pos+d)!=chr(prev+d)||tget(pos+d)!=tget(prev+d)){ | |
diff=true;break; | |
}else if(d>0&&(isLMS(pos+d)||isLMS(prev+d)))break; | |
if(diff){ | |
++name;prev=pos; | |
} | |
pos=(pos%2==0)?pos/2:(pos-1)/2; | |
SA[n1+pos]=name-1; | |
} | |
for(i=n-1,j=n-1;i>=n1;--i)if(SA[i]>=0)SA[j--]=SA[i]; | |
// stage 2: solve the reduced problem | |
// recurse if names are not yet unique | |
int *SA1=SA,*s1=SA+n-n1; | |
if(name<n1)SA_IS((unsigned char*)s1,SA1,n1,name-1,sizeof(int)); | |
else // generate the suffix array of s1 directly | |
for(i=0;i<n1;++i)SA1[s1[i]]=i; | |
// stage 3: induce the result for | |
// the original problem | |
// bucket array | |
bkt=(int *)malloc(sizeof(int)*(K+1)); | |
// put all the LMS characters into their buckets | |
// find ends of buckets | |
getBuckets(s,bkt,n,K,cs,true); | |
for(i=1,j=0;i<n;++i)if(isLMS(i))s1[j++]=i; // get p1 | |
// get index in s | |
for(i=0;i<n1;++i)SA1[i]=s1[SA1[i]]; | |
// init SA[n1..n-1] | |
for(i=n1;i<n;++i)SA[i]=-1; | |
for(i=n1-1;i>=0;--i){ | |
j=SA[i];SA[i]=-1; | |
SA[--bkt[chr(j)]]=j; | |
} | |
induceSAl(t,SA,s,bkt,n,K,cs,false); | |
induceSAs(t,SA,s,bkt,n,K,cs,true); | |
free(bkt);free(t); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#ifndef UCHAR_SIZE | |
# define UCHAR_SIZE 256 | |
#endif | |
#ifndef MINBUCKETSIZE | |
# define MINBUCKETSIZE 256 | |
#endif | |
#define sais_index_type int | |
#define sais_bool_type int | |
#define SAIS_LMSSORT2_LIMIT 0x3fffffff | |
#define SAIS_MYMALLOC(_num, _type) ((_type *)malloc((_num) * sizeof(_type))) | |
#define chr(_a) (cs == sizeof(sais_index_type) ? ((sais_index_type *)T)[(_a)] : ((unsigned char *)T)[(_a)]) | |
/* find the start or end of each bucket */ | |
inline void getCounts(const void *T,sais_index_type *C,sais_index_type n,sais_index_type k,int cs){ | |
sais_index_type i; | |
for(i=0;i<k;++i)C[i]=0; | |
for(i=0;i<n;++i)++C[chr(i)]; | |
} | |
inline void getBuckets(const sais_index_type *C,sais_index_type *B,sais_index_type k,sais_bool_type end){ | |
sais_index_type i,sum=0; | |
if(end)for(i=0;i<k;++i){ | |
sum+=C[i]; | |
B[i]=sum; | |
}else for(i=0;i<k;++i){ | |
sum+=C[i]; | |
B[i]=sum-C[i]; | |
} | |
} | |
/* sort all type LMS suffixes */ | |
inline void LMSsort1(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type n,sais_index_type k,int cs){ | |
sais_index_type *b,i,j; | |
sais_index_type c0,c1; | |
/* compute SAl */ | |
if(C==B)getCounts(T,C,n,k,cs); | |
getBuckets(C,B,k,0); /* find starts of buckets */ | |
j=n-1; | |
b=SA+B[c1=chr(j)]; | |
--j; | |
*b++=(chr(j)<c1)?~j:j; | |
for(i=0;i<n;++i){ | |
if(0<(j=SA[i])){ | |
if((c0=chr(j))!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
--j; | |
*b++=(chr(j)<c1)?~j:j; | |
SA[i]=0; | |
}else if(j<0)SA[i]=~j; | |
} | |
/* compute SAs */ | |
if(C==B)getCounts(T,C,n,k,cs); | |
getBuckets(C,B,k,1); /* find ends of buckets */ | |
for(i=n-1,b=SA+B[c1=0];0<=i;--i){ | |
if(0<(j=SA[i])){ | |
if((c0=chr(j))!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
--j; | |
*--b=(chr(j)>c1)?~(j+1):j; | |
SA[i]=0; | |
} | |
} | |
} | |
inline sais_index_type LMSpostproc1(const void *T,sais_index_type *SA,sais_index_type n,sais_index_type m,int cs){ | |
sais_index_type i,j,p,q,plen,qlen,name; | |
sais_index_type c0,c1; | |
sais_bool_type diff; | |
/* compact all the sorted substrings into the first m items of SA | |
2*m must be not larger than n (proveable) */ | |
for(i=0;(p=SA[i])<0;++i)SA[i]=~p; | |
if(i<m){ | |
for(j=i,++i;;++i){ | |
if((p=SA[i])<0){ | |
SA[j++]=~p;SA[i]=0; | |
if(j==m)break; | |
} | |
} | |
} | |
/* store the length of all substrings */ | |
i=n-1;j=n-1;c0=chr(n-1); | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))>=c1)); | |
for(;0<=i;){ | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))<=c1)); | |
if(0<=i){ | |
SA[m+((i+1)>>1)]=j-i; | |
j=i+1; | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))>=c1)); | |
} | |
} | |
/* find the lexicographic names of all substrings */ | |
for(i=0,name=0,q=n,qlen=0;i<m;++i){ | |
p=SA[i],plen=SA[m+(p>>1)],diff=1; | |
if((plen==qlen)&&((q+plen)<n)){ | |
for(j=0;(j<plen)&&(chr(p+j)==chr(q+j));++j); | |
if(j==plen)diff=0; | |
} | |
if(diff!=0)++name,q=p,qlen=plen; | |
SA[m+(p>>1)]=name; | |
} | |
return name; | |
} | |
inline void LMSsort2(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type *D,sais_index_type n,sais_index_type k,int cs){ | |
sais_index_type *b,i,j,t,d; | |
sais_index_type c0,c1; | |
/* compute SAl */ | |
getBuckets(C,B,k,0); /* find starts of buckets */ | |
j=n-1; | |
b=SA+B[c1=chr(j)]; | |
--j; | |
t=(chr(j)<c1); | |
j+=n; | |
*b++=(t&1)?~j:j; | |
for(i=0,d=0;i<n;++i){ | |
if(0<(j=SA[i])){ | |
if(n<=j)d+=1,j-=n; | |
if((c0=chr(j))!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
--j; | |
t=c0;t=(t<<1)|(chr(j)<c1); | |
if(D[t]!=d)j+=n,D[t]=d; | |
*b++=(t&1)?~j:j; | |
SA[i]=0; | |
}else if(j<0)SA[i]=~j; | |
} | |
for(i=n-1;0<=i;--i){ | |
if(0<SA[i]){ | |
if(SA[i]<n){ | |
SA[i]+=n; | |
for(j=i-1;SA[j]<n;--j); | |
SA[j]-=n; | |
i=j; | |
} | |
} | |
} | |
/* compute SAs */ | |
getBuckets(C,B,k,1); /* find ends of buckets */ | |
for(i=n-1,d+=1,b=SA+B[c1=0];0<=i;--i){ | |
if(0<(j=SA[i])){ | |
if(n<=j)d+=1,j-=n; | |
if((c0=chr(j))!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
--j; | |
t=c0;t=(t<<1)|(chr(j)>c1); | |
if(D[t]!=d)j+=n,D[t]=d; | |
*--b=(t&1)?~(j+1):j; | |
SA[i]=0; | |
} | |
} | |
} | |
inline sais_index_type LMSpostproc2(sais_index_type *SA,sais_index_type n,sais_index_type m){ | |
sais_index_type i,j,d,name; | |
/* compact all the sorted LMS substrings into the first m items of SA */ | |
for(i=0,name=0;(j=SA[i])<0;++i){ | |
j=~j; | |
if(n<=j)name+=1; | |
SA[i]=j; | |
} | |
if(i<m){ | |
for(d=i,++i;;++i){ | |
if((j=SA[i])<0){ | |
j=~j; | |
if(n<=j)name+=1; | |
SA[d++]=j;SA[i]=0; | |
if(d==m)break; | |
} | |
} | |
} | |
if(name<m){ | |
/* store the lexicographic names */ | |
for(i=m-1,d=name+1;0<=i;--i){ | |
if(n<=(j=SA[i]))j-=n,--d; | |
SA[m+(j>>1)]=d; | |
} | |
}else{ | |
/* unset flags */ | |
for(i=0;i<m;++i){ | |
if(n<=(j=SA[i]))j-=n,SA[i]=j; | |
} | |
} | |
return name; | |
} | |
/* compute SA and BWT */ | |
inline void induceSA(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type n,sais_index_type k,int cs){ | |
sais_index_type *b,i,j; | |
sais_index_type c0,c1; | |
/* compute SAl */ | |
if(C==B)getCounts(T, C, n, k, cs); | |
getBuckets(C,B,k,0); /* find starts of buckets */ | |
j =n-1; | |
b=SA+B[c1=chr(j)]; | |
*b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
for(i=0;i<n;++i){ | |
j=SA[i],SA[i]=~j; | |
if(0<j){ | |
--j; | |
if((c0=chr(j))!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
*b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
} | |
} | |
/* compute SAs */ | |
if(C==B)getCounts(T,C,n,k,cs); | |
getBuckets(C,B,k,1); /* find ends of buckets */ | |
for(i=n-1,b=SA+B[c1=0];0<=i;--i){ | |
if(0<(j=SA[i])){ | |
--j; | |
if((c0=chr(j))!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
*--b=((j==0)||(chr(j-1)>c1))?~j:j; | |
}else SA[i]=~j; | |
} | |
} | |
inline sais_index_type computeBWT(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type n,sais_index_type k,int cs){ | |
sais_index_type *b,i,j,pidx=-1; | |
sais_index_type c0,c1; | |
/* compute SAl */ | |
if(C==B)getCounts(T, C, n, k, cs); | |
getBuckets(C,B,k,0); /* find starts of buckets */ | |
j=n-1; | |
b=SA+B[c1=chr(j)]; | |
*b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
for(i=0;i<n;++i){ | |
if(0<(j=SA[i])){ | |
--j; | |
SA[i]=~((sais_index_type)(c0 = chr(j))); | |
if(c0!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
*b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
}else if(j!=0)SA[i]=~j; | |
} | |
/* compute SAs */ | |
if(C==B)getCounts(T,C,n,k,cs); | |
getBuckets(C,B,k,1); /* find ends of buckets */ | |
for(i=n-1,b=SA+B[c1=0];0<=i;--i){ | |
if(0<(j=SA[i])){ | |
--j; | |
SA[i]=(c0=chr(j)); | |
if(c0!=c1){ | |
B[c1]=b-SA; | |
b=SA+B[c1=c0]; | |
} | |
*--b=((0<j)&&(chr(j-1)>c1))?~((sais_index_type)chr(j-1)):j; | |
}else if(j!=0)SA[i]=~j; | |
else pidx=i; | |
} | |
return pidx; | |
} | |
/* find the suffix array SA of T[0..n-1] in {0..255}^n */ | |
sais_index_type sais_main(const void *T,sais_index_type *SA,sais_index_type fs,sais_index_type n,sais_index_type k,int cs,sais_bool_type isbwt){ | |
sais_index_type *C,*B,*D,*RA,*b; | |
sais_index_type i,j,m,p,q,t,name,pidx=0,newfs; | |
sais_index_type c0,c1; | |
unsigned int flags; | |
if(k<=MINBUCKETSIZE){ | |
if((C=SAIS_MYMALLOC(k,sais_index_type))==NULL)return -2; | |
if(k<=fs){ | |
B=SA+(n+fs-k); | |
flags=1; | |
}else{ | |
if((B=SAIS_MYMALLOC(k,sais_index_type))==NULL){ | |
free(C); | |
return -2; | |
} | |
flags=3; | |
} | |
}else if(k<=fs){ | |
C=SA+(n+fs-k); | |
if(k<=(fs-k)){ | |
B=C-k; | |
flags=0; | |
}else if(k<=(MINBUCKETSIZE*4)){ | |
if((B=SAIS_MYMALLOC(k,sais_index_type))==NULL)return -2; | |
flags=2; | |
}else{ | |
B=C; | |
flags=8; | |
} | |
}else{ | |
if((C=B=SAIS_MYMALLOC(k,sais_index_type))==NULL)return -2; | |
flags=4|8; | |
} | |
if((n<=SAIS_LMSSORT2_LIMIT)&&(2<=(n/k))){ | |
if(flags&1)flags|=((k*2)<=(fs-k))?32:16; | |
else if((flags==0)&&((k*2)<=(fs-k*2)))flags|=32; | |
} | |
/* stage 1: reduce the problem by at least 1/2 | |
sort all the LMS-substrings */ | |
getCounts(T,C,n,k,cs);getBuckets(C,B,k,1); /* find ends of buckets */ | |
for(i=0;i<n;++i)SA[i]=0; | |
b=&t;i=n-1;j=n;m=0;c0=chr(n-1); | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))>=c1)); | |
for(;0<=i;){ | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))<=c1)); | |
if(0<=i){ | |
*b=j;b=SA+--B[c1];j=i;++m; | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))>=c1)); | |
} | |
} | |
if(1<m){ | |
if(flags&(16|32)){ | |
if(flags&16){ | |
if((D=SAIS_MYMALLOC(k*2,sais_index_type))==NULL){ | |
if(flags&(1|4))free(C); | |
if(flags&2)free(B); | |
return -2; | |
} | |
}else D=B-k*2; | |
++B[chr(j+1)]; | |
for(i=0,j=0;i<k;++i){ | |
j+=C[i]; | |
if(B[i]!=j)SA[B[i]]+=n; | |
D[i]=D[i+k]=0; | |
} | |
LMSsort2(T,SA,C,B,D,n,k,cs); | |
name=LMSpostproc2(SA,n,m); | |
if(flags&16)free(D); | |
}else{ | |
LMSsort1(T,SA,C,B,n,k,cs); | |
name=LMSpostproc1(T,SA,n,m,cs); | |
} | |
}else if(m==1){ | |
*b=j+1; | |
name=1; | |
}else name = 0; | |
/* stage 2: solve the reduced problem | |
recurse if names are not yet unique */ | |
if(name<m){ | |
if(flags&4)free(C); | |
if(flags&2)free(B); | |
newfs=(n+fs)-(m*2); | |
if((flags&(1|4|8))==0){ | |
if((k+name)<=newfs)newfs-=k; | |
else flags|=8; | |
} | |
RA=SA+m+newfs; | |
for(i=m+(n>>1)-1,j=m-1;m<=i;--i){ | |
if(SA[i]!=0)RA[j--]=SA[i]-1; | |
} | |
if(sais_main(RA,SA,newfs,m,name,sizeof(sais_index_type),0)!=0){ | |
if(flags&1)free(C); | |
return -2; | |
} | |
i=n-1;j=m-1;c0=chr(n-1); | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))>=c1)); | |
for(;0<=i;){ | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))<=c1)); | |
if(0<=i){ | |
RA[j--]=i+1; | |
do{ | |
c1=c0; | |
}while((0<=--i)&&((c0=chr(i))>=c1)); | |
} | |
} | |
for(i=0;i<m;++i)SA[i]=RA[SA[i]]; | |
if(flags&4){ | |
if((C=B=SAIS_MYMALLOC(k,int))==NULL)return -2; | |
} | |
if(flags&2){ | |
if((B=SAIS_MYMALLOC(k,int))==NULL){ | |
if(flags&1)free(C); | |
return -2; | |
} | |
} | |
} | |
/* stage 3: induce the result for the original problem */ | |
if(flags&8)getCounts(T,C,n,k,cs); | |
/* put all left-most S characters into their buckets */ | |
if(1<m){ | |
getBuckets(C,B,k,1); /* find ends of buckets */ | |
i=m-1,j=n,p=SA[m-1],c1=chr(p); | |
do{ | |
q=B[c0=c1]; | |
while(q<j)SA[--j]=0; | |
do{ | |
SA[--j]=p; | |
if(--i<0)break; | |
p=SA[i]; | |
}while((c1=chr(p))==c0); | |
}while(0<=i); | |
while(0<j)SA[--j]=0; | |
} | |
if(isbwt==0)induceSA(T,SA,C,B,n,k,cs); | |
else pidx=computeBWT(T,SA,C,B,n,k,cs); | |
if(flags&(1|4))free(C); | |
if(flags&2)free(B); | |
return pidx; | |
} | |
/*---------------------------------------------------------------------------*/ | |
inline int sais(const unsigned char *T,int *SA,int n){ | |
if((T==NULL)||(SA==NULL)||(n<0))return -1; | |
if(n<=1){ | |
if(n==1)SA[0]=0; | |
return 0; | |
} | |
return sais_main(T,SA,0,n,UCHAR_SIZE,sizeof(unsigned char),0); | |
} | |
inline int sais_int(const int *T,int *SA,int n,int k){ | |
if((T==NULL)||(SA==NULL)||(n<0)||(k<=0))return -1; | |
if(n<=1){ | |
if(n==1)SA[0]=0; | |
return 0; | |
} | |
return sais_main(T,SA,0,n,k,sizeof(int),0); | |
} | |
inline int sais_bwt(const unsigned char *T,unsigned char *U,int *A,int n){ | |
int i,pidx; | |
if((T==NULL)||(U==NULL)||(A==NULL)||(n<0))return -1; | |
if(n<=1){ | |
if(n==1)U[0]=T[0]; | |
return n; | |
} | |
pidx=sais_main(T,A,0,n,UCHAR_SIZE,sizeof(unsigned char),1); | |
if(pidx<0)return pidx; | |
U[0]=T[n-1]; | |
for(i=0;i<pidx;++i)U[i+1]=(unsigned char)A[i]; | |
for(i+=1;i<n;++i)U[i]=(unsigned char)A[i]; | |
pidx+=1; | |
return pidx; | |
} | |
inline int sais_int_bwt(const int *T,int *U,int *A,int n,int k){ | |
int i,pidx; | |
if((T==NULL)||(U==NULL)||(A==NULL)||(n<0)||(k<=0))return -1; | |
if(n<=1){ | |
if(n==1)U[0]=T[0]; | |
return n; | |
} | |
pidx=sais_main(T,A,0,n,k,sizeof(int),1); | |
if(pidx<0)return pidx; | |
U[0]=T[n-1]; | |
for(i=0;i<pidx;++i)U[i+1]=A[i]; | |
for(i+=1;i<n;++i)U[i]=A[i]; | |
pidx+=1; | |
return pidx; | |
} | |
char s[10000005]; | |
int sa[10000005],len; | |
int main(){ | |
freopen("in.txt","r",stdin); | |
gets(s); | |
int st=clock(); | |
sais((unsigned char*)s,sa,len=strlen(s)); | |
printf("%f\n",(double)(clock()-st)/1000); | |
//for(int i=0;i<len;++i)printf("%d\n",sa[i]); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define pushS(x) sa[--bkt[s[x]]] = x | |
#define pushL(x) sa[bkt[s[x]]++] = x | |
#define induce_sort(v){\ | |
fill_n(sa,n,0);\ | |
copy_n(_bkt,A,bkt);\ | |
for(i=n1-1;~i;--i)pushS(v[i]);\ | |
copy_n(_bkt,A-1,bkt+1);\ | |
for(i=0;i<n;++i)if(sa[i]&&!t[sa[i]-1])pushL(sa[i]-1);\ | |
copy_n(_bkt,A,bkt);\ | |
for(i=n-1;~i;--i)if(sa[i]&&t[sa[i]-1])pushS(sa[i]-1);\ | |
} | |
template<typename T> | |
void sais(const T s,int n,int *sa,int *_bkt,int *p,bool *t,int A){ | |
int *rnk=p+n,*s1=p+n/2,*bkt=_bkt+A; | |
int n1=0,i,j,x=t[n-1]=1,y=rnk[0]=-1,cnt=-1; | |
for(i=n-2;~i;--i)t[i]=(s[i]==s[i+1]?t[i+1]:s[i]<s[i+1]); | |
for(i=1;i<n;++i)rnk[i]=t[i]&&!t[i-1]?(p[n1]=i,n1++):-1; | |
fill_n(_bkt,A,0); | |
for(i=0;i<n;++i)++_bkt[s[i]]; | |
for(i=1;i<A;++i)_bkt[i]+=_bkt[i-1]; | |
induce_sort(p); | |
for(i=0;i<n;++i)if(~(x=rnk[sa[i]])) | |
j=y<0||memcmp(s+p[x],s+p[y],(p[x+1]-p[x])*sizeof(s[0])) | |
,s1[y=x]=cnt+=j; | |
if(cnt+1<n1)sais(s1,n1,sa,bkt,rnk,t+n,cnt+1); | |
else for(i=0;i<n1;++i)sa[s1[i]]=i; | |
for(i=0;i<n1;++i)s1[i]=p[sa[i]]; | |
induce_sort(s1); | |
} | |
/*****************這些是需要用到的陣列大小**************/ | |
const int MAXN=10000005,MAXA='z'+1; | |
int sa[MAXN],bkt[MAXN+MAXA],p[MAXN*2]; | |
bool t[MAXN*2]; | |
char s[MAXN]; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In SA-IS very fast.cpp, i don't get var flags stand for, can you explain?