Last active
October 11, 2025 05:24
-
-
Save jacky860226/1d33adad858eef71bfe18120d8d69e6d to your computer and use it in GitHub Desktop.
Suffix Array + Longest Common Prefix
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * DC3 (Difference Cover 3) 後綴陣列構造算法 | |
| * | |
| * 算法原理: | |
| * 1. 將所有後綴分為三類:Type A (i%3==0), Type B1 (i%3==1), Type B2 (i%3==2) | |
| * 2. 遞歸處理 Type B 後綴(佔總數的 2/3) | |
| * 3. 利用 Type B 的排序結果來排序 Type A 後綴 | |
| * 4. 歸併 Type A 和 Type B 得到完整的後綴陣列 | |
| * | |
| * 時間複雜度分析: | |
| * T(n, σ) = T(⌊2n/3⌋, O(n)) + O(n + σ) | |
| * 其中: | |
| * - n = string_length(字符串長度) | |
| * - σ = alphabet_size(字母表大小) | |
| * | |
| * 每層遞歸的線性操作包括: | |
| * - 計數排序:O(n + σ),需要處理 σ 個不同字符 | |
| * - 三字符比較和分類:O(n) | |
| * - 歸併操作:O(n) | |
| * | |
| * 遞歸深度:O(log_{3/2} n) | |
| * 總時間複雜度:O(n + σ) 當 σ = O(n) 時為 O(n) | |
| * | |
| * 空間複雜度:O(n),遞歸棧空間 O(log n) | |
| * | |
| * @param s 輸入字符串 | |
| * @return 後綴陣列,sa[i] 表示字典序第 i 小的後綴在原串中的起始位置 | |
| */ | |
| vector<int> suffix_array_dc3(const string &s) { | |
| if (s.empty()) | |
| return {}; | |
| // 計算所需存儲空間 | |
| // 遞歸關係 | |
| // M(n) = (n + δ(n) + 2) , n ≤ 2 | |
| // (n + δ(n) + 2) + M(⌊2(n + δ(n))/3⌋), n > 2 | |
| // 其中 δ(n) = 1 if n%3==1, else 0 | |
| // 解得:M(n) <= 3n + 4log_{3/2}(n-2) + 1 ≈ 3n + O(log n) | |
| int storage_size = [](int n) { | |
| if (n <= 2) | |
| return n + 3; | |
| return 3 * n + int(4 * log(n - 2) / log(1.5)) + 1; | |
| }(s.size()); | |
| // 工作數組分配 | |
| vector<int> suffix_array_v(storage_size, 0), str_v(storage_size, 0); | |
| vector<int> bucket(max<int>(256, s.size() + 1), 0), | |
| temp_a(2 * (s.size() + 1) / 3, 0), temp_b(2 * (s.size() + 1) / 3, 0); | |
| /** | |
| * 計數排序 (Counting Sort) - DC3算法的核心排序組件 | |
| * | |
| * 時間複雜度:O(length + max_val) = O(n + σ) | |
| * - length 通常為 O(n),表示待排序的後綴數量 | |
| * - max_val 為字母表大小 σ,影響計數排序的效率 | |
| * - 當 σ = O(n) 時,整體複雜度為 O(n) | |
| * - 當 σ >> n 時,複雜度受 σ 主導 | |
| * | |
| * 空間複雜度:O(σ) 用於 bucket 數組 | |
| * | |
| * @param str 字符數組,用於獲取排序鍵值 | |
| * @param input_idx 待排序的索引數組 | |
| * @param sorted_idx 排序結果存放數組 | |
| * @param length 待排序元素的數量 | |
| * @param max_val 最大字符值(字母表大小-1) | |
| */ | |
| auto counting_sort = [&](int *str, auto &input_idx, auto &sorted_idx, | |
| int length, int max_val) { | |
| // 清空計數桶,時間複雜度 O(σ) | |
| for (int i = 0; i <= max_val; ++i) | |
| bucket[i] = 0; | |
| // 統計每個字符的出現次數,時間複雜度 O(n) | |
| for (int i = 0; i < length; ++i) | |
| bucket[str[input_idx[i]]]++; | |
| // 計算累積計數,確定排序位置,時間複雜度 O(σ) | |
| for (int i = 1; i <= max_val; ++i) | |
| bucket[i] += bucket[i - 1]; | |
| // 從後向前穩定排序,時間複雜度 O(n) | |
| for (int i = length - 1; i >= 0; --i) | |
| sorted_idx[--bucket[str[input_idx[i]]]] = input_idx[i]; | |
| }; | |
| /** | |
| * 三字符比較函數 | |
| * 用於判斷兩個位置的三字符前綴是否完全相同 | |
| * 時間複雜度:O(1) | |
| */ | |
| auto compare_three_chars = [&](int *str, int pos_x, int pos_y) { | |
| return str[pos_x] == str[pos_y] && str[pos_x + 1] == str[pos_y + 1] && | |
| str[pos_x + 2] == str[pos_y + 2]; | |
| }; | |
| /** | |
| * DC3 主遞歸函數 | |
| * | |
| * 時間複雜度詳細分析: | |
| * T(n, σ) = T(⌊2n/3⌋, O(n)) + O(n + σ) | |
| * | |
| * 線性部分 O(n + σ) 包括: | |
| * 1. Type B 後綴收集與三輪計數排序:3 × O(n + σ) = O(n + σ) | |
| * 2. 重新編號和去重檢查:O(n) | |
| * 3. Type A 後綴處理:O(n + σ) | |
| * 4. 最終歸併:O(n) | |
| * | |
| * 遞歸部分:問題規模縮減到 ⌊2n/3⌋,字母表大小變為 O(n) | |
| * | |
| * 總體複雜度: | |
| * - 當 σ = O(1):T(n) = O(n) | |
| * - 當 σ = O(n):T(n) = O(n) | |
| * - 當 σ > n:T(n) = O(σ) | |
| * | |
| * @param str 數值化的字符數組 | |
| * @param sa 輸出的後綴數組 | |
| * @param string_length 字符串長度 | |
| * @param alphabet_size 字母表大小 | |
| */ | |
| std::function<void(int *, int *, int, int)> DC3 = [&](int *str, int *sa, | |
| int string_length, | |
| int alphabet_size) { | |
| // 長度模3等於1時需要填充,確保遞歸時 Type B1 和 B2 後綴間有隔離字符 | |
| bool needs_padding = (string_length % 3 == 1); | |
| if (needs_padding) | |
| str[string_length++] = 0; | |
| // 設置工作指針,利用現有空間避免額外分配 | |
| int *new_ranks = str + string_length + 2, // 新排名數組 | |
| *new_sa = sa + string_length, // 遞歸結果數組 | |
| Num12 = 0, current_rank; // Type B 後綴計數 | |
| // === 第一階段:收集並排序 Type B 後綴 (i%3 != 0) === | |
| // Type B 後綴占總數的 2/3,是遞歸處理的對象 | |
| for (int i = 0; i < string_length; ++i) | |
| if (i % 3) | |
| temp_a[Num12++] = i; | |
| // 添加哨兵,確保邊界安全 | |
| str[string_length] = str[string_length + 1] = 0; | |
| // 三輪計數排序:按三字符前綴 (str[i], str[i+1], str[i+2]) 排序 | |
| // 每輪排序時間複雜度:O(n + σ) | |
| counting_sort(str + 2, temp_a, temp_b, Num12, alphabet_size); // 第三字符 | |
| counting_sort(str + 1, temp_b, temp_a, Num12, alphabet_size); // 第二字符 | |
| counting_sort(str, temp_a, temp_b, Num12, alphabet_size); // 第一字符 | |
| // 計算各類型後綴的數量 | |
| int Num0 = 0, Num1 = (string_length + 1) / 3; // Type B1 後綴數量 | |
| // 位置映射函數:將原始位置映射到遞歸數組中的位置 | |
| // Type B1 (i%3==1) 映射到前半部分 [0, Num1) | |
| // Type B2 (i%3==2) 映射到後半部分 [Num1, Num12) | |
| auto POSITION_MAPPING = [&](int x) { | |
| return (x / 3) + (x % 3 == 1 ? 0 : Num1); | |
| }; | |
| // === 第二階段:重新編號 Type B 後綴 === | |
| // 為相同的三字符前綴分配相同的排名 | |
| new_ranks[POSITION_MAPPING(temp_b[0])] = current_rank = 1; | |
| for (int i = 1; i < Num12; ++i) { | |
| // 如果三字符前綴不同,增加排名 | |
| if (!compare_three_chars(str, temp_b[i], temp_b[i - 1])) | |
| current_rank++; | |
| new_ranks[POSITION_MAPPING(temp_b[i])] = current_rank; | |
| } | |
| // === 第三階段:遞歸處理或直接構造 SA === | |
| if (current_rank < Num12) { | |
| // 仍有重複的三字符前綴,需要遞歸處理 | |
| // 遞歸調用:T(⌊2n/3⌋, current_rank) | |
| DC3(new_ranks, new_sa, Num12, current_rank); | |
| } else { | |
| // 所有三字符前綴都不同,直接構造後綴數組 | |
| for (int i = 0; i < Num12; ++i) | |
| if (new_ranks[i]) | |
| new_sa[new_ranks[i] - 1] = i; | |
| } | |
| // === 第四階段:處理 Type A 後綴 (i%3 == 0) === | |
| // 從 Type B 的排序結果中提取對應的 Type A 後綴 | |
| for (int i = 0; i < Num12; ++i) | |
| if (new_sa[i] < Num1) | |
| temp_b[Num0++] = new_sa[i] * 3; | |
| // 對 Type A 後綴按首字符排序,時間複雜度 O(n + σ) | |
| counting_sort(str, temp_b, temp_a, Num0, alphabet_size); | |
| // 逆向位置映射:從遞歸數組位置映射回原始位置 | |
| auto ORIGINAL_POSITION = [&](int x) { | |
| return x >= Num1 ? (x - Num1) * 3 + 2 : x * 3 + 1; | |
| }; | |
| // 建立 Type B 後綴的排名查找表 | |
| // new_ranks[position] = rank,用於常數時間比較 | |
| new_ranks = bucket.data(); | |
| for (int i = 0; i < Num12; ++i) | |
| new_ranks[temp_b[i] = ORIGINAL_POSITION(new_sa[i])] = i; | |
| new_ranks[string_length] = -1; // 哨兵 | |
| // === 第五階段:歸併 Type A 和 Type B 的排序結果 === | |
| /** | |
| * 後綴比較函數 - DC3算法的核心比較邏輯 | |
| * | |
| * 比較策略: | |
| * 1. 保證參數順序:pos_x 為 Type B,pos_y 為 Type A | |
| * 2. 首字符不同:直接比較字典序 | |
| * 3. Type B1 後綴 (pos % 3 == 1):比較遞歸排名 | |
| * 4. Type B2 後綴 (pos % 3 == 2):遞歸比較下一位 | |
| * | |
| * 性能優化: | |
| * - 計算 pos_x % 3 只執行一次,避免重複計算 | |
| * - 利用已計算的排名信息,避免字符串比較 | |
| * - 確保比較操作的常數時間複雜度 | |
| * | |
| * 時間複雜度:O(1) 均攤,這是DC3算法達到線性時間的關鍵 | |
| */ | |
| std::function<bool(int, int)> suffix_comparator = [&](int pos_x, | |
| int pos_y) { | |
| // 優化:計算模運算只一次,避免重複計算 | |
| int mod3 = pos_x % 3; | |
| // 保證 pos_x 是 Type B 後綴,pos_y 是 Type A 後綴 | |
| if (mod3 == 0) | |
| return !suffix_comparator(pos_y, pos_x); | |
| // 首字符不同,直接比較 | |
| if (str[pos_x] != str[pos_y]) | |
| return str[pos_x] < str[pos_y]; | |
| if (mod3 == 1) | |
| // Type B1: 比較下一位的遞歸排名 | |
| return new_ranks[pos_x + 1] < new_ranks[pos_y + 1]; | |
| else | |
| // Type B2: 遞歸比較,交換參數順序實現正確的字典序 | |
| return !suffix_comparator(pos_y + 1, pos_x + 1); | |
| }; | |
| // 使用 std::merge 歸併兩個已排序的後綴類型 | |
| // 注意:needs_padding 用於跳過隔離字符 | |
| // 當 needs_padding=true 時,隔離字符 \0 必然排在 temp_b 的第一位 | |
| // 因為 \0 是最小值,所以歸併時從 temp_b[1] 開始,跳過這個隔離字符 | |
| merge(temp_a.begin(), temp_a.begin() + Num0, // Type A 範圍 | |
| temp_b.begin() + needs_padding, temp_b.begin() + Num12, // Type B 範圍 | |
| sa, // 輸出位置 | |
| suffix_comparator); // 比較函數 | |
| }; | |
| // === 算法執行階段 === | |
| // 將字符串轉換為數值數組,便於處理 | |
| for (size_t i = 0; i < s.size(); ++i) | |
| str_v[i] = s[i]; | |
| // 調用主算法:DC3(字符數組, 後綴數組, 長度, 字母表大小) | |
| // 字母表大小設為 255,支持所有 ASCII 字符 | |
| DC3(str_v.data(), suffix_array_v.data(), s.size(), 255); | |
| // 調整結果數組大小並返回 | |
| suffix_array_v.resize(s.size()); | |
| return suffix_array_v; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include<algorithm> | |
| #define radix_sort(x,y){\ | |
| for(i=0;i<A;++i)c[i]=0;\ | |
| for(i=0;i<len;++i)c[x[y[i]]]++;\ | |
| for(i=1;i<A;++i)c[i]+=c[i-1];\ | |
| for(i=len-1;i>=0;--i)sa[--c[x[y[i]]]]=y[i];\ | |
| } | |
| #define sac(r,a,b) r[a]!=r[b]||a+k>=len||r[a+k]!=r[b+k] | |
| inline void suffix_array(const char *s,int len,int *sa,int *rank,int *tmp,int *c){ | |
| int A='z'+1,i,k,id=0; | |
| for(i=0;i<len;++i)rank[tmp[i]=i]=s[i]; | |
| radix_sort(rank,tmp); | |
| for(k=1;id<len-1;k<<=1){ | |
| for(id=0,i=len-k;i<len;++i)tmp[id++]=i; | |
| for(i=0;i<len;++i)if(sa[i]>=k)tmp[id++]=sa[i]-k; | |
| radix_sort(rank,tmp); | |
| std::swap(rank,tmp); | |
| for(rank[sa[0]]=id=0,i=1;i<len;++i) | |
| rank[sa[i]]=id+=sac(tmp,sa[i-1],sa[i]); | |
| A=id+1; | |
| } | |
| } | |
| #undef radix_sort | |
| #undef sac |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include<algorithm> | |
| struct CMP{ | |
| int len,k,*rank,a,b; | |
| inline bool operator()(int i,int j){ | |
| if(rank[i]!=rank[j])return rank[i]<rank[j]; | |
| a=(i+=k)<len?rank[i]:-1; | |
| b=(j+=k)<len?rank[j]:-1; | |
| return a<b; | |
| } | |
| }; | |
| inline void suffix_array(const char *s,int len,int *sa,int *rank,int *tmp){ | |
| for(int i=0;i<len;++i){ | |
| sa[i]=i; | |
| rank[i]=s[i]; | |
| } | |
| CMP cmp={len,1}; | |
| for(;;cmp.k<<=1){ | |
| cmp.rank=rank; | |
| std::sort(sa,sa+len,cmp); | |
| tmp[sa[0]]=0; | |
| for(int i=1;i<len;++i)tmp[sa[i]]=tmp[sa[i-1]]+cmp(sa[i-1],sa[i]); | |
| if(tmp[sa[len-1]]==len-1)break; | |
| std::swap(rank,tmp); | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| h:高度數組 | |
| sa:後綴數組 | |
| rank:排名 | |
| */ | |
| inline void suffix_array_lcp(const char *s,int len,int *h,int *sa,int *rank){ | |
| for(int i=0;i<len;++i)rank[sa[i]]=i; | |
| for(int i=0,k=0;i<len;++i){ | |
| if(rank[i]==0)continue; | |
| if(k)--k; | |
| while(s[i+k]==s[sa[rank[i]-1]+k])++k; | |
| h[rank[i]]=k; | |
| } | |
| h[0]=0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include<string.h> | |
| #define MAGIC(XD){\ | |
| memset(sa,0,sizeof(int)*n);\ | |
| memcpy(x,c,sizeof(int)*z);\ | |
| XD\ | |
| memcpy(x+1,c,sizeof(int)*(z-1));\ | |
| for(i=0;i<n;++i){\ | |
| if(sa[i]&&!t[sa[i]-1])sa[x[s[sa[i]-1]]++]=sa[i]-1;\ | |
| }\ | |
| memcpy(x,c,sizeof(int)*z);\ | |
| for(i=n-1;i>=0;--i){\ | |
| if(sa[i]&&t[sa[i]-1])sa[--x[s[sa[i]-1]]]=sa[i]-1;\ | |
| }\ | |
| } | |
| void sais(int *s,int *sa,int *p,bool *t,int *c,int n,int z){ | |
| bool neq=t[n-1]=1; | |
| int nn=0,nmxz=-1,*q=sa+n,*ns=s+n,*x=c+z,lst=-1,i,j; | |
| memset(c,0,sizeof(int)*z); | |
| for(i=0;i<n;++i)++c[s[i]]; | |
| for(i=0;i<z-1;++i)c[i+1]+=c[i]; | |
| for(i=n-2;i>=0;i--)t[i]=(s[i]==s[i+1]?t[i+1]:s[i]<s[i+1]); | |
| MAGIC( | |
| for(i=1;i<n;++i){ | |
| if(t[i]&&!t[i-1])sa[--x[s[i]]]=p[q[i]=nn++]=i; | |
| } | |
| ); | |
| for(i=0;i<n;++i)if((j=sa[i])&&t[j]&&!t[j-1]){ | |
| neq=lst<0||memcmp(s+j,s+lst,(p[q[j]+1]-j)*sizeof(int)); | |
| ns[q[lst=j]]=nmxz+=neq; | |
| } | |
| if(nmxz==nn-1)for(i=0;i<nn;++i)q[ns[i]]=i; | |
| else sais(ns,q,p+nn,t+n,c+z,nn,nmxz+1); | |
| MAGIC( | |
| for(i=nn-1;i>=0;--i)sa[--x[s[p[q[i]]]]]=p[q[i]]; | |
| ); | |
| } | |
| #undef MAGIC | |
| /*****************這些是需要用到的陣列大小**************/ | |
| static const int MXN=10000000; | |
| int s[MXN*2+5],sa[MXN*2+5],p[MXN+5],c[MXN*2+5]; | |
| bool t[MXN*2+5]; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /*SA-IS Algorithm*/ | |
| const unsigned char mask[]={0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01}; | |
| #define tget(i) ( (t[(i)/8]&mask[(i)%8])?1:0) | |
| #define tset(i, b) t[(i)/8]=(b)?(mask[(i)%8]|t[(i)/8]):((~mask[(i)%8])&t[(i)/8]) | |
| #define chr(i) (cs==sizeof(int)?((int*)s)[i]:((unsigned char *)s)[i]) | |
| #define isLMS(i) (i>0&&tget(i)&&!tget(i-1)) | |
| // find the start or end of each bucket | |
| inline void getBuckets(unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
| int i,sum=0; | |
| // clear all buckets | |
| for(i=0;i<=K;++i)bkt[i]=0; | |
| // compute the size of each bucket | |
| for(i=0;i<n;++i)++bkt[chr(i)]; | |
| for(i=0;i<=K;++i){ | |
| sum+=bkt[i]; | |
| bkt[i]=end?sum:sum-bkt[i]; | |
| } | |
| } | |
| // compute SAl | |
| inline void induceSAl(unsigned char *t,int *SA,unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
| int i,j; | |
| // find starts of buckets | |
| getBuckets(s,bkt,n,K,cs,end); | |
| for(i=0;i<n;++i){ | |
| j=SA[i]-1; | |
| if(j>=0&&!tget(j))SA[bkt[chr(j)]++]=j; | |
| } | |
| } | |
| // compute SAs | |
| inline void induceSAs(unsigned char *t,int *SA,unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
| int i,j; | |
| // find ends of buckets | |
| getBuckets(s,bkt,n,K,cs,end); | |
| for(i=n-1;i>=0;--i){ | |
| j=SA[i]-1; | |
| if(j>=0&&tget(j))SA[--bkt[chr(j)]]=j; | |
| } | |
| } | |
| // find the suffix array SA of s[0..n-1] in {1..K}?n | |
| // require s[n-1]=0 (the sentinel!), n>=2 | |
| // use a working space (excluding s and SA) of | |
| // at most 2.25n+O(1) for a constant alphabet | |
| void SA_IS(unsigned char *s,int *SA,int n,int K,short cs){ | |
| // LS-type array in bits | |
| unsigned char *t=(unsigned char *)malloc(n/8+1); | |
| int i,j; | |
| // classify the type of each character | |
| // the sentinel must be in s1, important!!! | |
| tset(n-2,0);tset(n-1,1); | |
| for(i=n-3;i>=0;--i) | |
| tset(i,(chr(i)<chr(i+1)||(chr(i)==chr(i+1)&&tget(i+1)==1))?1:0); | |
| // stage 1: reduce the problem by at least 1/2 | |
| // sort all the S-substrings | |
| // bucket array | |
| int *bkt=(int *)malloc(sizeof(int)*(K+1)); | |
| // find ends of buckets | |
| getBuckets(s,bkt,n,K,cs,true); | |
| for(i=0;i<n;++i)SA[i]=-1; | |
| for(i=1;i<n;++i)if(isLMS(i))SA[--bkt[chr(i)]]=i; | |
| induceSAl(t,SA,s,bkt,n,K,cs,false); | |
| induceSAs(t,SA,s,bkt,n,K,cs,true); | |
| free(bkt); | |
| // compact all the sorted substrings into | |
| // the first n1 items of SA | |
| // 2*n1 must be not larger than n (proveable) | |
| int n1=0; | |
| for(i=0;i<n;++i)if(isLMS(SA[i]))SA[n1++]=SA[i]; | |
| // find the lexicographic names of substrings | |
| // init the name array buffer | |
| for(i=n1;i<n;++i)SA[i]=-1; | |
| int name=0,prev=-1; | |
| for(i=0;i<n1;++i){ | |
| int pos=SA[i];bool diff=false; | |
| for(int d=0;d<n;++d) | |
| if(prev==-1||chr(pos+d)!=chr(prev+d)||tget(pos+d)!=tget(prev+d)){ | |
| diff=true;break; | |
| }else if(d>0&&(isLMS(pos+d)||isLMS(prev+d)))break; | |
| if(diff){ | |
| ++name;prev=pos; | |
| } | |
| pos=(pos%2==0)?pos/2:(pos-1)/2; | |
| SA[n1+pos]=name-1; | |
| } | |
| for(i=n-1,j=n-1;i>=n1;--i)if(SA[i]>=0)SA[j--]=SA[i]; | |
| // stage 2: solve the reduced problem | |
| // recurse if names are not yet unique | |
| int *SA1=SA,*s1=SA+n-n1; | |
| if(name<n1)SA_IS((unsigned char*)s1,SA1,n1,name-1,sizeof(int)); | |
| else // generate the suffix array of s1 directly | |
| for(i=0;i<n1;++i)SA1[s1[i]]=i; | |
| // stage 3: induce the result for | |
| // the original problem | |
| // bucket array | |
| bkt=(int *)malloc(sizeof(int)*(K+1)); | |
| // put all the LMS characters into their buckets | |
| // find ends of buckets | |
| getBuckets(s,bkt,n,K,cs,true); | |
| for(i=1,j=0;i<n;++i)if(isLMS(i))s1[j++]=i; // get p1 | |
| // get index in s | |
| for(i=0;i<n1;++i)SA1[i]=s1[SA1[i]]; | |
| // init SA[n1..n-1] | |
| for(i=n1;i<n;++i)SA[i]=-1; | |
| for(i=n1-1;i>=0;--i){ | |
| j=SA[i];SA[i]=-1; | |
| SA[--bkt[chr(j)]]=j; | |
| } | |
| induceSAl(t,SA,s,bkt,n,K,cs,false); | |
| induceSAs(t,SA,s,bkt,n,K,cs,true); | |
| free(bkt);free(t); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <time.h> | |
| #include <string.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #ifndef UCHAR_SIZE | |
| # define UCHAR_SIZE 256 | |
| #endif | |
| #ifndef MINBUCKETSIZE | |
| # define MINBUCKETSIZE 256 | |
| #endif | |
| #define sais_index_type int | |
| #define sais_bool_type int | |
| #define SAIS_LMSSORT2_LIMIT 0x3fffffff | |
| #define SAIS_MYMALLOC(_num, _type) ((_type *)malloc((_num) * sizeof(_type))) | |
| #define chr(_a) (cs == sizeof(sais_index_type) ? ((sais_index_type *)T)[(_a)] : ((unsigned char *)T)[(_a)]) | |
| /* find the start or end of each bucket */ | |
| inline void getCounts(const void *T,sais_index_type *C,sais_index_type n,sais_index_type k,int cs){ | |
| sais_index_type i; | |
| for(i=0;i<k;++i)C[i]=0; | |
| for(i=0;i<n;++i)++C[chr(i)]; | |
| } | |
| inline void getBuckets(const sais_index_type *C,sais_index_type *B,sais_index_type k,sais_bool_type end){ | |
| sais_index_type i,sum=0; | |
| if(end)for(i=0;i<k;++i){ | |
| sum+=C[i]; | |
| B[i]=sum; | |
| }else for(i=0;i<k;++i){ | |
| sum+=C[i]; | |
| B[i]=sum-C[i]; | |
| } | |
| } | |
| /* sort all type LMS suffixes */ | |
| inline void LMSsort1(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type n,sais_index_type k,int cs){ | |
| sais_index_type *b,i,j; | |
| sais_index_type c0,c1; | |
| /* compute SAl */ | |
| if(C==B)getCounts(T,C,n,k,cs); | |
| getBuckets(C,B,k,0); /* find starts of buckets */ | |
| j=n-1; | |
| b=SA+B[c1=chr(j)]; | |
| --j; | |
| *b++=(chr(j)<c1)?~j:j; | |
| for(i=0;i<n;++i){ | |
| if(0<(j=SA[i])){ | |
| if((c0=chr(j))!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| --j; | |
| *b++=(chr(j)<c1)?~j:j; | |
| SA[i]=0; | |
| }else if(j<0)SA[i]=~j; | |
| } | |
| /* compute SAs */ | |
| if(C==B)getCounts(T,C,n,k,cs); | |
| getBuckets(C,B,k,1); /* find ends of buckets */ | |
| for(i=n-1,b=SA+B[c1=0];0<=i;--i){ | |
| if(0<(j=SA[i])){ | |
| if((c0=chr(j))!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| --j; | |
| *--b=(chr(j)>c1)?~(j+1):j; | |
| SA[i]=0; | |
| } | |
| } | |
| } | |
| inline sais_index_type LMSpostproc1(const void *T,sais_index_type *SA,sais_index_type n,sais_index_type m,int cs){ | |
| sais_index_type i,j,p,q,plen,qlen,name; | |
| sais_index_type c0,c1; | |
| sais_bool_type diff; | |
| /* compact all the sorted substrings into the first m items of SA | |
| 2*m must be not larger than n (proveable) */ | |
| for(i=0;(p=SA[i])<0;++i)SA[i]=~p; | |
| if(i<m){ | |
| for(j=i,++i;;++i){ | |
| if((p=SA[i])<0){ | |
| SA[j++]=~p;SA[i]=0; | |
| if(j==m)break; | |
| } | |
| } | |
| } | |
| /* store the length of all substrings */ | |
| i=n-1;j=n-1;c0=chr(n-1); | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))>=c1)); | |
| for(;0<=i;){ | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))<=c1)); | |
| if(0<=i){ | |
| SA[m+((i+1)>>1)]=j-i; | |
| j=i+1; | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))>=c1)); | |
| } | |
| } | |
| /* find the lexicographic names of all substrings */ | |
| for(i=0,name=0,q=n,qlen=0;i<m;++i){ | |
| p=SA[i],plen=SA[m+(p>>1)],diff=1; | |
| if((plen==qlen)&&((q+plen)<n)){ | |
| for(j=0;(j<plen)&&(chr(p+j)==chr(q+j));++j); | |
| if(j==plen)diff=0; | |
| } | |
| if(diff!=0)++name,q=p,qlen=plen; | |
| SA[m+(p>>1)]=name; | |
| } | |
| return name; | |
| } | |
| inline void LMSsort2(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type *D,sais_index_type n,sais_index_type k,int cs){ | |
| sais_index_type *b,i,j,t,d; | |
| sais_index_type c0,c1; | |
| /* compute SAl */ | |
| getBuckets(C,B,k,0); /* find starts of buckets */ | |
| j=n-1; | |
| b=SA+B[c1=chr(j)]; | |
| --j; | |
| t=(chr(j)<c1); | |
| j+=n; | |
| *b++=(t&1)?~j:j; | |
| for(i=0,d=0;i<n;++i){ | |
| if(0<(j=SA[i])){ | |
| if(n<=j)d+=1,j-=n; | |
| if((c0=chr(j))!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| --j; | |
| t=c0;t=(t<<1)|(chr(j)<c1); | |
| if(D[t]!=d)j+=n,D[t]=d; | |
| *b++=(t&1)?~j:j; | |
| SA[i]=0; | |
| }else if(j<0)SA[i]=~j; | |
| } | |
| for(i=n-1;0<=i;--i){ | |
| if(0<SA[i]){ | |
| if(SA[i]<n){ | |
| SA[i]+=n; | |
| for(j=i-1;SA[j]<n;--j); | |
| SA[j]-=n; | |
| i=j; | |
| } | |
| } | |
| } | |
| /* compute SAs */ | |
| getBuckets(C,B,k,1); /* find ends of buckets */ | |
| for(i=n-1,d+=1,b=SA+B[c1=0];0<=i;--i){ | |
| if(0<(j=SA[i])){ | |
| if(n<=j)d+=1,j-=n; | |
| if((c0=chr(j))!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| --j; | |
| t=c0;t=(t<<1)|(chr(j)>c1); | |
| if(D[t]!=d)j+=n,D[t]=d; | |
| *--b=(t&1)?~(j+1):j; | |
| SA[i]=0; | |
| } | |
| } | |
| } | |
| inline sais_index_type LMSpostproc2(sais_index_type *SA,sais_index_type n,sais_index_type m){ | |
| sais_index_type i,j,d,name; | |
| /* compact all the sorted LMS substrings into the first m items of SA */ | |
| for(i=0,name=0;(j=SA[i])<0;++i){ | |
| j=~j; | |
| if(n<=j)name+=1; | |
| SA[i]=j; | |
| } | |
| if(i<m){ | |
| for(d=i,++i;;++i){ | |
| if((j=SA[i])<0){ | |
| j=~j; | |
| if(n<=j)name+=1; | |
| SA[d++]=j;SA[i]=0; | |
| if(d==m)break; | |
| } | |
| } | |
| } | |
| if(name<m){ | |
| /* store the lexicographic names */ | |
| for(i=m-1,d=name+1;0<=i;--i){ | |
| if(n<=(j=SA[i]))j-=n,--d; | |
| SA[m+(j>>1)]=d; | |
| } | |
| }else{ | |
| /* unset flags */ | |
| for(i=0;i<m;++i){ | |
| if(n<=(j=SA[i]))j-=n,SA[i]=j; | |
| } | |
| } | |
| return name; | |
| } | |
| /* compute SA and BWT */ | |
| inline void induceSA(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type n,sais_index_type k,int cs){ | |
| sais_index_type *b,i,j; | |
| sais_index_type c0,c1; | |
| /* compute SAl */ | |
| if(C==B)getCounts(T, C, n, k, cs); | |
| getBuckets(C,B,k,0); /* find starts of buckets */ | |
| j =n-1; | |
| b=SA+B[c1=chr(j)]; | |
| *b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
| for(i=0;i<n;++i){ | |
| j=SA[i],SA[i]=~j; | |
| if(0<j){ | |
| --j; | |
| if((c0=chr(j))!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| *b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
| } | |
| } | |
| /* compute SAs */ | |
| if(C==B)getCounts(T,C,n,k,cs); | |
| getBuckets(C,B,k,1); /* find ends of buckets */ | |
| for(i=n-1,b=SA+B[c1=0];0<=i;--i){ | |
| if(0<(j=SA[i])){ | |
| --j; | |
| if((c0=chr(j))!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| *--b=((j==0)||(chr(j-1)>c1))?~j:j; | |
| }else SA[i]=~j; | |
| } | |
| } | |
| inline sais_index_type computeBWT(const void *T,sais_index_type *SA,sais_index_type *C,sais_index_type *B,sais_index_type n,sais_index_type k,int cs){ | |
| sais_index_type *b,i,j,pidx=-1; | |
| sais_index_type c0,c1; | |
| /* compute SAl */ | |
| if(C==B)getCounts(T, C, n, k, cs); | |
| getBuckets(C,B,k,0); /* find starts of buckets */ | |
| j=n-1; | |
| b=SA+B[c1=chr(j)]; | |
| *b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
| for(i=0;i<n;++i){ | |
| if(0<(j=SA[i])){ | |
| --j; | |
| SA[i]=~((sais_index_type)(c0 = chr(j))); | |
| if(c0!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| *b++=((0<j)&&(chr(j-1)<c1))?~j:j; | |
| }else if(j!=0)SA[i]=~j; | |
| } | |
| /* compute SAs */ | |
| if(C==B)getCounts(T,C,n,k,cs); | |
| getBuckets(C,B,k,1); /* find ends of buckets */ | |
| for(i=n-1,b=SA+B[c1=0];0<=i;--i){ | |
| if(0<(j=SA[i])){ | |
| --j; | |
| SA[i]=(c0=chr(j)); | |
| if(c0!=c1){ | |
| B[c1]=b-SA; | |
| b=SA+B[c1=c0]; | |
| } | |
| *--b=((0<j)&&(chr(j-1)>c1))?~((sais_index_type)chr(j-1)):j; | |
| }else if(j!=0)SA[i]=~j; | |
| else pidx=i; | |
| } | |
| return pidx; | |
| } | |
| /* find the suffix array SA of T[0..n-1] in {0..255}^n */ | |
| sais_index_type sais_main(const void *T,sais_index_type *SA,sais_index_type fs,sais_index_type n,sais_index_type k,int cs,sais_bool_type isbwt){ | |
| sais_index_type *C,*B,*D,*RA,*b; | |
| sais_index_type i,j,m,p,q,t,name,pidx=0,newfs; | |
| sais_index_type c0,c1; | |
| unsigned int flags; | |
| if(k<=MINBUCKETSIZE){ | |
| if((C=SAIS_MYMALLOC(k,sais_index_type))==NULL)return -2; | |
| if(k<=fs){ | |
| B=SA+(n+fs-k); | |
| flags=1; | |
| }else{ | |
| if((B=SAIS_MYMALLOC(k,sais_index_type))==NULL){ | |
| free(C); | |
| return -2; | |
| } | |
| flags=3; | |
| } | |
| }else if(k<=fs){ | |
| C=SA+(n+fs-k); | |
| if(k<=(fs-k)){ | |
| B=C-k; | |
| flags=0; | |
| }else if(k<=(MINBUCKETSIZE*4)){ | |
| if((B=SAIS_MYMALLOC(k,sais_index_type))==NULL)return -2; | |
| flags=2; | |
| }else{ | |
| B=C; | |
| flags=8; | |
| } | |
| }else{ | |
| if((C=B=SAIS_MYMALLOC(k,sais_index_type))==NULL)return -2; | |
| flags=4|8; | |
| } | |
| if((n<=SAIS_LMSSORT2_LIMIT)&&(2<=(n/k))){ | |
| if(flags&1)flags|=((k*2)<=(fs-k))?32:16; | |
| else if((flags==0)&&((k*2)<=(fs-k*2)))flags|=32; | |
| } | |
| /* stage 1: reduce the problem by at least 1/2 | |
| sort all the LMS-substrings */ | |
| getCounts(T,C,n,k,cs);getBuckets(C,B,k,1); /* find ends of buckets */ | |
| for(i=0;i<n;++i)SA[i]=0; | |
| b=&t;i=n-1;j=n;m=0;c0=chr(n-1); | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))>=c1)); | |
| for(;0<=i;){ | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))<=c1)); | |
| if(0<=i){ | |
| *b=j;b=SA+--B[c1];j=i;++m; | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))>=c1)); | |
| } | |
| } | |
| if(1<m){ | |
| if(flags&(16|32)){ | |
| if(flags&16){ | |
| if((D=SAIS_MYMALLOC(k*2,sais_index_type))==NULL){ | |
| if(flags&(1|4))free(C); | |
| if(flags&2)free(B); | |
| return -2; | |
| } | |
| }else D=B-k*2; | |
| ++B[chr(j+1)]; | |
| for(i=0,j=0;i<k;++i){ | |
| j+=C[i]; | |
| if(B[i]!=j)SA[B[i]]+=n; | |
| D[i]=D[i+k]=0; | |
| } | |
| LMSsort2(T,SA,C,B,D,n,k,cs); | |
| name=LMSpostproc2(SA,n,m); | |
| if(flags&16)free(D); | |
| }else{ | |
| LMSsort1(T,SA,C,B,n,k,cs); | |
| name=LMSpostproc1(T,SA,n,m,cs); | |
| } | |
| }else if(m==1){ | |
| *b=j+1; | |
| name=1; | |
| }else name = 0; | |
| /* stage 2: solve the reduced problem | |
| recurse if names are not yet unique */ | |
| if(name<m){ | |
| if(flags&4)free(C); | |
| if(flags&2)free(B); | |
| newfs=(n+fs)-(m*2); | |
| if((flags&(1|4|8))==0){ | |
| if((k+name)<=newfs)newfs-=k; | |
| else flags|=8; | |
| } | |
| RA=SA+m+newfs; | |
| for(i=m+(n>>1)-1,j=m-1;m<=i;--i){ | |
| if(SA[i]!=0)RA[j--]=SA[i]-1; | |
| } | |
| if(sais_main(RA,SA,newfs,m,name,sizeof(sais_index_type),0)!=0){ | |
| if(flags&1)free(C); | |
| return -2; | |
| } | |
| i=n-1;j=m-1;c0=chr(n-1); | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))>=c1)); | |
| for(;0<=i;){ | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))<=c1)); | |
| if(0<=i){ | |
| RA[j--]=i+1; | |
| do{ | |
| c1=c0; | |
| }while((0<=--i)&&((c0=chr(i))>=c1)); | |
| } | |
| } | |
| for(i=0;i<m;++i)SA[i]=RA[SA[i]]; | |
| if(flags&4){ | |
| if((C=B=SAIS_MYMALLOC(k,int))==NULL)return -2; | |
| } | |
| if(flags&2){ | |
| if((B=SAIS_MYMALLOC(k,int))==NULL){ | |
| if(flags&1)free(C); | |
| return -2; | |
| } | |
| } | |
| } | |
| /* stage 3: induce the result for the original problem */ | |
| if(flags&8)getCounts(T,C,n,k,cs); | |
| /* put all left-most S characters into their buckets */ | |
| if(1<m){ | |
| getBuckets(C,B,k,1); /* find ends of buckets */ | |
| i=m-1,j=n,p=SA[m-1],c1=chr(p); | |
| do{ | |
| q=B[c0=c1]; | |
| while(q<j)SA[--j]=0; | |
| do{ | |
| SA[--j]=p; | |
| if(--i<0)break; | |
| p=SA[i]; | |
| }while((c1=chr(p))==c0); | |
| }while(0<=i); | |
| while(0<j)SA[--j]=0; | |
| } | |
| if(isbwt==0)induceSA(T,SA,C,B,n,k,cs); | |
| else pidx=computeBWT(T,SA,C,B,n,k,cs); | |
| if(flags&(1|4))free(C); | |
| if(flags&2)free(B); | |
| return pidx; | |
| } | |
| /*---------------------------------------------------------------------------*/ | |
| inline int sais(const unsigned char *T,int *SA,int n){ | |
| if((T==NULL)||(SA==NULL)||(n<0))return -1; | |
| if(n<=1){ | |
| if(n==1)SA[0]=0; | |
| return 0; | |
| } | |
| return sais_main(T,SA,0,n,UCHAR_SIZE,sizeof(unsigned char),0); | |
| } | |
| inline int sais_int(const int *T,int *SA,int n,int k){ | |
| if((T==NULL)||(SA==NULL)||(n<0)||(k<=0))return -1; | |
| if(n<=1){ | |
| if(n==1)SA[0]=0; | |
| return 0; | |
| } | |
| return sais_main(T,SA,0,n,k,sizeof(int),0); | |
| } | |
| inline int sais_bwt(const unsigned char *T,unsigned char *U,int *A,int n){ | |
| int i,pidx; | |
| if((T==NULL)||(U==NULL)||(A==NULL)||(n<0))return -1; | |
| if(n<=1){ | |
| if(n==1)U[0]=T[0]; | |
| return n; | |
| } | |
| pidx=sais_main(T,A,0,n,UCHAR_SIZE,sizeof(unsigned char),1); | |
| if(pidx<0)return pidx; | |
| U[0]=T[n-1]; | |
| for(i=0;i<pidx;++i)U[i+1]=(unsigned char)A[i]; | |
| for(i+=1;i<n;++i)U[i]=(unsigned char)A[i]; | |
| pidx+=1; | |
| return pidx; | |
| } | |
| inline int sais_int_bwt(const int *T,int *U,int *A,int n,int k){ | |
| int i,pidx; | |
| if((T==NULL)||(U==NULL)||(A==NULL)||(n<0)||(k<=0))return -1; | |
| if(n<=1){ | |
| if(n==1)U[0]=T[0]; | |
| return n; | |
| } | |
| pidx=sais_main(T,A,0,n,k,sizeof(int),1); | |
| if(pidx<0)return pidx; | |
| U[0]=T[n-1]; | |
| for(i=0;i<pidx;++i)U[i+1]=A[i]; | |
| for(i+=1;i<n;++i)U[i]=A[i]; | |
| pidx+=1; | |
| return pidx; | |
| } | |
| char s[10000005]; | |
| int sa[10000005],len; | |
| int main(){ | |
| freopen("in.txt","r",stdin); | |
| gets(s); | |
| int st=clock(); | |
| sais((unsigned char*)s,sa,len=strlen(s)); | |
| printf("%f\n",(double)(clock()-st)/1000); | |
| //for(int i=0;i<len;++i)printf("%d\n",sa[i]); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define pushS(x) sa[--bkt[s[x]]] = x | |
| #define pushL(x) sa[bkt[s[x]]++] = x | |
| #define induce_sort(v){\ | |
| fill_n(sa,n,0);\ | |
| copy_n(_bkt,A,bkt);\ | |
| for(i=n1-1;~i;--i)pushS(v[i]);\ | |
| copy_n(_bkt,A-1,bkt+1);\ | |
| for(i=0;i<n;++i)if(sa[i]&&!t[sa[i]-1])pushL(sa[i]-1);\ | |
| copy_n(_bkt,A,bkt);\ | |
| for(i=n-1;~i;--i)if(sa[i]&&t[sa[i]-1])pushS(sa[i]-1);\ | |
| } | |
| template<typename T> | |
| void sais(const T s,int n,int *sa,int *_bkt,int *p,bool *t,int A){ | |
| int *rnk=p+n,*s1=p+n/2,*bkt=_bkt+A; | |
| int n1=0,i,j,x=t[n-1]=1,y=rnk[0]=-1,cnt=-1; | |
| for(i=n-2;~i;--i)t[i]=(s[i]==s[i+1]?t[i+1]:s[i]<s[i+1]); | |
| for(i=1;i<n;++i)rnk[i]=t[i]&&!t[i-1]?(p[n1]=i,n1++):-1; | |
| fill_n(_bkt,A,0); | |
| for(i=0;i<n;++i)++_bkt[s[i]]; | |
| for(i=1;i<A;++i)_bkt[i]+=_bkt[i-1]; | |
| induce_sort(p); | |
| for(i=0;i<n;++i)if(~(x=rnk[sa[i]])) | |
| j=y<0||memcmp(s+p[x],s+p[y],(p[x+1]-p[x])*sizeof(s[0])) | |
| ,s1[y=x]=cnt+=j; | |
| if(cnt+1<n1)sais(s1,n1,sa,bkt,rnk,t+n,cnt+1); | |
| else for(i=0;i<n1;++i)sa[s1[i]]=i; | |
| for(i=0;i<n1;++i)s1[i]=p[sa[i]]; | |
| induce_sort(s1); | |
| } | |
| /*****************這些是需要用到的陣列大小**************/ | |
| const int MAXN=10000005,MAXA='z'+1; | |
| int sa[MAXN],bkt[MAXN+MAXA],p[MAXN*2]; | |
| bool t[MAXN*2]; | |
| char s[MAXN]; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In SA-IS very fast.cpp, i don't get var flags stand for, can you explain?