/*
 * call-seq:
 *   SuffixArray.new(source) -> SuffixArray
 *
 * Given a string (anything like a string really) this will generate a
 * suffix array for the string so that you can work with it.  The
 * source cannot be an empty string since this is a useless operation.
 *
 * Raises the extension's error class when the source is empty or when
 * memory for the suffix index cannot be allocated.
 */
static VALUE SuffixArray_initialize(VALUE self, VALUE source)
{
    SuffixArray *sa = NULL;
    size_t i = 0;

    Data_Get_Struct(self, SuffixArray, sa);
    assert(sa != NULL);

    // get the source string (converting string-like objects) and keep it in our structure
    sa->source = StringValue(source);

    // temporary views of the source bytes and their length
    unsigned char *sa_source = (unsigned char *)RSTRING(sa->source)->ptr;
    size_t sa_source_len = RSTRING(sa->source)->len;

    if(sa_source_len == 0) {
        // an empty source is rejected with an exception (rb_raise does not return)
        rb_raise(cSAError, ERR_NO_ZERO_LENGTH_INPUT);
    }

    // allocate memory for the index integers (one slot more than the source length)
    sa->suffix_index = malloc(sizeof *sa->suffix_index * (sa_source_len + 1));
    if(sa->suffix_index == NULL) {
        // without this check a failed allocation would be dereferenced below
        rb_raise(cSAError, "unable to allocate memory for the suffix index");
    }

    // create the suffix array from the source
    // NOTE(review): bsarray is handed sa_source_len-1 as its length argument;
    // confirm this matches bsarray's contract.
    int start = bsarray(sa_source, sa->suffix_index, sa_source_len-1);

    // set the suffix_start in our object
    rb_iv_set(self, "@suffix_start", INT2NUM(start));

    // Record, for every leading character, the first and last positions in the
    // suffix index whose suffix begins with that character.
    unsigned char c = sa_source[sa->suffix_index[0]];
    sa->starts[c] = 0;  // the first entry opens the first character's range

    // entry 0 seeded c above, so scanning can begin at entry 1
    for(i = 1; i < sa_source_len; i++) {
        unsigned char current = sa_source[sa->suffix_index[i]];

        // skip entries until the leading character changes
        if(current != c) {
            sa->ends[c] = i-1;  // the previous character's range ended just behind this entry
            c = current;
            sa->starts[c] = i;
        }
    }

    // close the range of the last character; the loop never does this itself
    c = sa_source[sa->suffix_index[sa_source_len-1]];
    sa->ends[c] = sa_source_len-1;

    return INT2FIX(sa_source_len);
}