ksa.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. /*
  2. * Copyright (c) 2008 Yuta Mori All Rights Reserved.
  3. * 2011 Attractive Chaos <attractor@live.co.uk>
  4. *
  5. * Permission is hereby granted, free of charge, to any person
  6. * obtaining a copy of this software and associated documentation
  7. * files (the "Software"), to deal in the Software without
  8. * restriction, including without limitation the rights to use,
  9. * copy, modify, merge, publish, distribute, sublicense, and/or sell
  10. * copies of the Software, and to permit persons to whom the
  11. * Software is furnished to do so, subject to the following
  12. * conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be
  15. * included in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  18. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  19. * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  20. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  21. * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  22. * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  24. * OTHER DEALINGS IN THE SOFTWARE.
  25. */
  26. /* This is a library for constructing the suffix array for a string containing
  27. * multiple sentinels with sentinels all represented by 0. The last symbol in
  28. * the string must be a sentinel. The library is modified from an early version
  29. * of Yuta Mori's SAIS library, but is slower than the lastest SAIS by about
  30. * 30%, partly due to the recent optimization Yuta has applied and partly due
  31. * to the extra comparisons between sentinels. This is not the first effort in
  32. * supporting multi-sentinel strings, but is probably the easiest to use. */
  33. #include <stdlib.h>
  34. #ifdef _KSA64
  35. #include <stdint.h>
  36. typedef int64_t saint_t;
  37. #define SAINT_MAX INT64_MAX
  38. #define SAIS_CORE ksa_core64
  39. #define SAIS_BWT ksa_bwt64
  40. #define SAIS_MAIN ksa_sa64
  41. #else
  42. #include <limits.h>
  43. typedef int saint_t;
  44. #define SAINT_MAX INT_MAX
  45. #define SAIS_CORE ksa_core
  46. #define SAIS_BWT ksa_bwt
  47. #define SAIS_MAIN ksa_sa
  48. #endif
  49. /* T is of type "const unsigned char*". If T[i] is a sentinel, chr(i) takes a negative value */
  50. #define chr(i) (cs == sizeof(saint_t) ? ((const saint_t *)T)[i] : (T[i]? (saint_t)T[i] : i - SAINT_MAX))
  51. /** Count the occurrences of each symbol */
  52. static void getCounts(const unsigned char *T, saint_t *C, saint_t n, saint_t k, int cs)
  53. {
  54. saint_t i;
  55. for (i = 0; i < k; ++i) C[i] = 0;
  56. for (i = 0; i < n; ++i) {
  57. saint_t c = chr(i);
  58. ++C[c > 0? c : 0];
  59. }
  60. }
  61. /**
  62. * Find the end of each bucket
  63. *
  64. * @param C occurrences computed by getCounts(); input
  65. * @param B start/end of each bucket; output
  66. * @param k size of alphabet
  67. * @param end compute the end of bucket if true; otherwise compute the end
  68. */
  69. static inline void getBuckets(const saint_t *C, saint_t *B, saint_t k, saint_t end)
  70. {
  71. saint_t i, sum = 0;
  72. if (end) for (i = 0; i < k; ++i) sum += C[i], B[i] = sum;
  73. else for (i = 0; i < k; ++i) sum += C[i], B[i] = sum - C[i];
  74. }
  75. /** Induced sort */
  76. static void induceSA(const unsigned char *T, saint_t *SA, saint_t *C, saint_t *B, saint_t n, saint_t k, saint_t cs)
  77. {
  78. saint_t *b, i, j;
  79. saint_t c0, c1;
  80. /* left-to-right induced sort (for L-type) */
  81. if (C == B) getCounts(T, C, n, k, cs);
  82. getBuckets(C, B, k, 0); /* find starts of buckets */
  83. for (i = 0, b = 0, c1 = -1; i < n; ++i) {
  84. j = SA[i], SA[i] = ~j;
  85. if (0 < j) { /* >0 if j-1 is L-type; <0 if S-type; ==0 undefined */
  86. --j;
  87. if ((c0 = chr(j)) != c1) {
  88. B[c1 > 0? c1 : 0] = b - SA;
  89. c1 = c0;
  90. b = SA + B[c1 > 0? c1 : 0];
  91. }
  92. *b++ = (0 < j && chr(j - 1) < c1) ? ~j : j;
  93. }
  94. }
  95. /* right-to-left induced sort (for S-type) */
  96. if (C == B) getCounts(T, C, n, k, cs);
  97. getBuckets(C, B, k, 1); /* find ends of buckets */
  98. for (i = n - 1, b = 0, c1 = -1; 0 <= i; --i) {
  99. if (0 < (j = SA[i])) { /* the prefix is S-type */
  100. --j;
  101. if ((c0 = chr(j)) != c1) {
  102. B[c1 > 0? c1 : 0] = b - SA;
  103. c1 = c0;
  104. b = SA + B[c1 > 0? c1 : 0];
  105. }
  106. if (c0 > 0) *--b = (j == 0 || chr(j - 1) > c1) ? ~j : j;
  107. } else SA[i] = ~j; /* if L-type, change the sign */
  108. }
  109. }
  110. /**
  111. * Recursively construct the suffix array for a string containing multiple
  112. * sentinels. NULL is taken as the sentinel.
  113. *
  114. * @param T NULL terminated input string (there can be multiple NULLs)
  115. * @param SA output suffix array
  116. * @param fs working space available in SA (typically 0 when first called)
  117. * @param n length of T, including the trailing NULL
  118. * @param k size of the alphabet (typically 256 when first called)
  119. * @param cs # bytes per element in T; 1 or sizeof(saint_t) (typically 1 when first called)
  120. *
  121. * @return 0 upon success
  122. */
  123. int SAIS_CORE(const unsigned char *T, saint_t *SA, saint_t fs, saint_t n, saint_t k, int cs)
  124. {
  125. saint_t *C, *B;
  126. saint_t i, j, c, m, q, qlen, name;
  127. saint_t c0, c1;
  128. /* STAGE I: reduce the problem by at least 1/2 sort all the S-substrings */
  129. if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
  130. else {
  131. if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
  132. B = cs == 1? C + k : C;
  133. }
  134. getCounts(T, C, n, k, cs);
  135. getBuckets(C, B, k, 1); /* find ends of buckets */
  136. for (i = 0; i < n; ++i) SA[i] = 0;
  137. /* mark L and S (the t array in Nong et al.), and keep the positions of LMS in the buckets */
  138. for (i = n - 2, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
  139. if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1); c==1 if in an S run */
  140. else if (c) SA[--B[c1 > 0? c1 : 0]] = i + 1, c = 0;
  141. }
  142. induceSA(T, SA, C, B, n, k, cs);
  143. if (fs < k) free(C);
  144. /* pack all the sorted LMS into the first m items of SA
  145. 2*m must be not larger than n (see Nong et al. for the proof) */
  146. for (i = 0, m = 0; i < n; ++i) {
  147. saint_t p = SA[i];
  148. if (p == n - 1) SA[m++] = p;
  149. else if (0 < p && chr(p - 1) > (c0 = chr(p))) {
  150. for (j = p + 1; j < n && c0 == (c1 = chr(j)); ++j);
  151. if (j < n && c0 < c1) SA[m++] = p;
  152. }
  153. }
  154. for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */
  155. /* store the length of all substrings */
  156. for (i = n - 2, j = n, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
  157. if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1) */
  158. else if (c) SA[m + ((i + 1) >> 1)] = j - i - 1, j = i + 1, c = 0;
  159. }
  160. /* find the lexicographic names of all substrings */
  161. for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
  162. saint_t p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
  163. if (plen == qlen) {
  164. for (j = 0; j < plen && chr(p + j) == chr(q + j); j++);
  165. if (j == plen) diff = 0;
  166. }
  167. if (diff) ++name, q = p, qlen = plen;
  168. SA[m + (p >> 1)] = name;
  169. }
  170. /* STAGE II: solve the reduced problem; recurse if names are not yet unique */
  171. if (name < m) {
  172. saint_t *RA = SA + n + fs - m - 1;
  173. for (i = n - 1, j = m - 1; m <= i; --i)
  174. if (SA[i] != 0) RA[j--] = SA[i];
  175. RA[m] = 0; // add a sentinel; in the resulting SA, SA[0]==m always stands
  176. if (SAIS_CORE((unsigned char *)RA, SA, fs + n - m * 2 - 2, m + 1, name + 1, sizeof(saint_t)) != 0) return -2;
  177. for (i = n - 2, j = m - 1, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
  178. if ((c0 = chr(i)) < c1 + c) c = 1;
  179. else if (c) RA[j--] = i + 1, c = 0; /* get p1 */
  180. }
  181. for (i = 0; i < m; ++i) SA[i] = RA[SA[i+1]]; /* get index */
  182. }
  183. /* STAGE III: induce the result for the original problem */
  184. if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
  185. else {
  186. if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
  187. B = cs == 1? C + k : C;
  188. }
  189. /* put all LMS characters into their buckets */
  190. getCounts(T, C, n, k, cs);
  191. getBuckets(C, B, k, 1); /* find ends of buckets */
  192. for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
  193. for (i = m - 1; 0 <= i; --i) {
  194. j = SA[i], SA[i] = 0;
  195. c = chr(j);
  196. SA[--B[c > 0? c : 0]] = j;
  197. }
  198. induceSA(T, SA, C, B, n, k, cs);
  199. if (fs < k) free(C);
  200. return 0;
  201. }
  202. /**
  203. * Construct the suffix array for a NULL terminated string possibly containing
  204. * multiple sentinels (NULLs).
  205. *
  206. * @param T[0..n-1] NULL terminated input string
  207. * @param SA[0..n-1] output suffix array
  208. * @param n length of the given string, including NULL
  209. * @param k size of the alphabet including the sentinel; no more than 256
  210. * @return 0 upon success
  211. */
  212. int SAIS_MAIN(const unsigned char *T, saint_t *SA, saint_t n, int k)
  213. {
  214. if (T == NULL || SA == NULL || T[n - 1] != '\0' || n <= 0) return -1;
  215. if (k < 0 || k > 256) k = 256;
  216. return SAIS_CORE(T, SA, 0, n, (saint_t)k, 1);
  217. }
  218. int SAIS_BWT(unsigned char *T, saint_t n, int k)
  219. {
  220. saint_t *SA, i;
  221. int ret;
  222. if ((SA = malloc(n * sizeof(saint_t))) == 0) return -1;
  223. if ((ret = SAIS_MAIN(T, SA, n, k)) != 0) return ret;
  224. for (i = 0; i < n; ++i)
  225. if (SA[i]) SA[i] = T[SA[i] - 1]; // if SA[i]==0, SA[i]=0
  226. for (i = 0; i < n; ++i) T[i] = SA[i];
  227. free(SA);
  228. return 0;
  229. }