/* Paice/Husk Stemmer Program 1994-5,by Andrew Stark Started 12-9-94. */ /* 7/30/2003 - Antonio Zamora: - allowed comment lines starting with semicolon in rules, - improved diagnostic handling for rules, - replaced acceptable() subroutine with fixed-length MINSTEMSIZE, - introduced trailing 2-digit strings as state markers in rules, - added rule display and debugging option, - modified applyrule() by prescreening rules to return "not applicable" for matches that would create unacceptably short stems, - added return code to readrules(). */ /* 8/10/2003 - AZ - modified stem() to return the rule trace */ #include #include #include #include #include "rule.h" #define maxrulesz 10 /* Maximum length of rule text */ #define maxlinelength 100 /* Maximun length of line read from file */ #define true 1 /* Boolean true */ #define false 0 /* Boolean false */ #define stop 2 /* Stop here */ #define continue 1 /* Continue with next rule */ #define notapply 0 /* Rule did not apply */ #define notintact 3 /* Word no longer intact */ #define blank '?' /* For when replace string is empty */ char addrule(struct rule r,struct rule t[26]); struct rule stem1(char *word,struct rule tbl[26], int debugsw, char *trace); int tblindex(char *s); int applyrule(struct rule *r,char *word,int isintact); int rulewalk(char *word,struct rule t[26],int is_intact,struct rule *used); int isvalidstr(char *s); struct rule makerule(char *s,int line); int readrules(FILE *fp,struct rule ttable[26], int debugsw); int flagerror(char *s); /* * * * * * * * ** * * * * * * * */ /* * * TBLINDEX() * * * * * * * * */ /* * * * * * * * ** * * * * * * * */ int tblindex(char *s){ int x; for(x=0; s[x] != '\0'; x++); /* Read to end of string */ x--; /* Go back one letter to be at end of word */ if ( (s[x] >= 'a') && (s[x] <= 'z') ) return (s[x]-'a'); /* Return number from 0..25 */ else return 25; /* non-alphabetics are combined with 'z' */ } /* * * * * * * * ** * * * * * * * */ /* * * ISVALIDSTR() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ int isvalidstr(char *s){ int x; int slen; if ((s == NULL) || (s[0] == '\0')) return 0; /* null pointer or null string is not valid string */ slen = strlen(s); if (slen >= 2) { /* allow trailing 2-byte numeric */ if ( isdigit(s[slen-1]) && isdigit(s[slen-2]) ) slen = slen - 2; } for(x=0; x < slen ;x++){ /* For each letter in the word... */ /* If it's not lower case or an apostrophe.. */ if(! islower(*(s + x)) && *(s + x) != '\''){ return 0; /* ..then it`s an error.. */ } } return 1; /* ..otherwise,it`s ok */ } /* * * * * * * * ** * * * * * * * */ /* * * ADDRULE() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ char addrule(struct rule r,struct rule t[26]){ int x; struct rule *temp; struct rule *trail; x = tblindex(r.keystr); /* Find out where in table to put it */ trail = (t + x); /* Set trail pointer to address of list header */ /* Walk along linked list with this loop */ for(temp = (t + x)->next;temp != NULL;temp = temp->next){ trail = temp; } /* Make a new instance of struct rule.. */ trail->next = (struct rule *) malloc(sizeof(struct rule)); memcpy(trail->next,&r,sizeof(struct rule)); /* ..copy r into it.. */ trail->next->next = NULL; /* ..and set its "next" pointer to null */ return 1; } /* * * * * * * * ** * * * * * * * */ /* * * APPLYRULE() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ int applyrule(struct rule *r,char *word,int isintact){ /* Apply the rule r to word,leaving results in word.Return stop,continue */ /* or notapply as appropriate */ int x, rl; if(! strcmp(r->text,"dummy")){ /* If it's just a dummy list header.. */ return notapply; /* ..then it automatically doesn't apply */ } if(! isintact && r->intact){ /* If it should be intact,but isn't.. */ return notapply; /* ..then rule fails */ } x = strlen(word) - r->deltotal; /* Find where suffix should start */ /* AZ - the following change speeds up the process: 1) Avoid matching rules for which the suffix is longer than the word. 2) Avoid applying rules that would result in unacceptably short stems. The second condition makes it possible to apply rules of smaller length that are substrings of another rule. E.g, given MINSTEMSIZE = 3 and the rules: tions,?,stop ions,?,stop s,?,stop For the word "actions" 'tions' would have matched and generated the 2 letter stem "ac". However, since this is smaller than MINSTEMSIZE, the rule is considered not applicable and the next rule, 'ions' will apply and generate "act") For "lions", 'ions' won't apply, but 's' applies to generate 'lion' */ rl = strlen(r->repstr); if ((x < 0) || (x + rl < MINSTEMSIZE) ) { /* AZ change */ return notapply; } if(! strcmp(word + x,r->keystr)){ /* If ending matches key string.. */ if (r->protect) return stop; /* If it is protected,then stop */ /* Before replacing, exclude terminal state markers from the replacement string length count to see if stem will be valid. It is more efficient to check here than before the match. */ if (rl >= 2) { if (isdigit(r->repstr[rl-1]) && isdigit(r->repstr[rl-2]) ) rl = rl - 2; } if (x + rl < MINSTEMSIZE ) { return notapply; } /* Replace matching keystr with repstr. */ word[x] = '\0'; /* truncate word at point of match */ if (r->repstr[0] != '\0') strcat(word, r->repstr); /* append repl. string if not null */ /* The original replacement code strcpy() did not work when the length of keystr was equal to length of repstr as noted by Christopher Fox 12/19/00. */ /* strcpy(word + x,r->repstr); ..then swap it for rep string.. */ if(r->cont){ /* If continue flag is set,return cont */ return continue; } return stop; } /* ending matches */ /* Ending does not match */ return notapply; /* ..otherwise,this rule is not applicable */ } /* * * * * * * * ** * * * * * * * */ /* * * RULEWALK() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ int rulewalk(char *word,struct rule t[26],int isintact,struct rule *used){ int x; int result; struct rule *rp; char tempwd[MAXWDSZ]; x = tblindex(word); /* Find out which list to walk along */ strcpy(tempwd,word); /* Copy word for safe keeping */ /* For each rule in list.. */ for(rp = (t + x)->next;rp != NULL;rp = rp->next){ strcpy(tempwd,word); /* Copy word for safe keeping */ /* If rule applied to this word... */ if((result = applyrule(rp,tempwd,isintact)) != notapply){ strcpy(word,tempwd); memcpy(used,rp,sizeof(struct rule)); return result; } } return stop; /* If no rule was used,then we can stop */ } /* * * * * * * * ** * * * * * * * */ /* * * MAKERULE() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ struct rule makerule(char *s,int line){ /* Format is: keystr,repstr,flags where keystr and repstr are strings,and */ /* flags is one of:"protect","intact","continue","protint","contint",or */ /* "stop" (without the inverted commas in the actual file). */ struct rule temp; char *tempkey; char *temprep; char *tempflags; int rl; int error = 0; tempkey = strtok(s,","); /* The unusual(but very useful) strtok */ temprep = strtok(NULL,","); /* function splits the line into fields */ tempflags = strtok(NULL,"\t"); /* delimited by commas (or tab at the end)*/ /* Handle possible errors in... */ /* ..key string.. */ if((tempkey == NULL) || (! isvalidstr(tempkey)) || (strlen(tempkey) >= MAXKEYSZ) ){ printf("Invalid key string:line %d\n",line); error = 1; } /* ..replace string.. */ if((temprep == NULL) || (strlen(temprep) >= MAXSUFFSZ) || (! isvalidstr(temprep) && strcmp(temprep,"?")) ){ printf("Invalid replace string:line %d\n",line); error = 1; } if((tempflags == NULL) ||(flagerror(tempflags))){ /* ..or flag field */ printf("Invalid flag or missing TAB after flag:line %d\n",line); error = 1; } if(error){ /* If there was an error then don't compile this rule */ *temp.keystr = '?'; /* Error signal */ return temp; } /* If there's no replace string,then put a null char instead */ *(temprep) = (*(temprep) == blank) ? '\0' : *(temprep); strcpy(temp.keystr,tempkey); /* Copy key string into the rule struct */ strcpy(temp.repstr,temprep); /* Copy replace string to same place */ if(! strcmp(tempflags,"protect")){ /* If flag field = "protect"... */ temp.cont = false; /* ..set continue to false.. */ temp.protect = true; /* ..set protect to true */ temp.intact = false; /* Guess what? */ } if(! strcmp(tempflags,"intact")){ temp.cont = false; temp.protect = false; temp.intact = true; } if(! strcmp(tempflags,"continue")){ temp.cont = true; temp.protect = false; temp.intact = false; } if(! strcmp(tempflags,"contint")){ temp.cont = true; temp.intact = true; temp.protect = false; } if(! strcmp(tempflags,"protint")){ temp.cont = false; temp.protect = true; temp.intact = true; } if(! strcmp(tempflags,"stop")){ temp.cont = false; temp.protect = false; temp.intact = false; } temp.deltotal = strlen(tempkey); /* Delete total = length of key string */ temp.rulenum = line; /* Line number of rule in file */ /* Check replacement string for special 2-digit markers */ rl = strlen(temprep); if (rl > 1) { if (isdigit(temprep[rl-1]) && (isdigit(temprep[rl-2]))) { if (temp.cont == false) { printf("** WARNING ** State marker may require CONTINUE:line %d\n",line); } } } return temp; } /* * * * * * * * ** * * * * * * * */ /* * * READRULES() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ int readrules(FILE *fp,struct rule ttable[26], int debugsw){ char s[maxlinelength]; char s1[maxlinelength]; int n = 0; struct rule temp; int condition; char con1[10]; int x; int errorct = 0; /* Initialize table */ /* For each item in the list.. */ for(x = 0; x < 26; x++){ (ttable + x)->next = NULL; /* Set the 'next' pointer to NULL,and.. */ strcpy((ttable + x)->text,"dummy"); /* ..set the text field to 'dummy' */ } while(fgets(s,maxlinelength,fp)){/* Read a line at a time until eof */ strcpy(s1,s); /* save input string */ /* Check for comment or blank line */ if ((s[0] == ';') || (s[0] == '\r') || (s[0] == '\n') || (s[0] == ' ')) { if (debugsw) printf("%s",s); } else { /* not a comment line */ n += 1; /* Increment line counter */ temp = makerule(s,n); /* Make line into a rule */ if (*temp.keystr == '?') { /* display saved string */ printf("*** ERROR in line %d: %s\n", n,s1); errorct++; goto nextline; } if (debugsw) { /* Print rule */ condition = 0; if (temp.intact) condition += 1; if (temp.cont) condition += 2; if (temp.protect) condition += 4; /* */ strcpy(con1,"???"); switch (condition) { case 0: strcpy(con1,"stop"); break; case 1: strcpy(con1,"intact"); break; case 2: strcpy(con1,"cont."); break; case 3: strcpy(con1,"contint"); break; case 4: strcpy(con1,"prot."); break; case 6: strcpy(con1,"protint"); break; default: break; } printf("%d (%s)->(%s) %s\n",n,temp.keystr,temp.repstr, con1); } /* Print rule */ addrule(temp,ttable); /* Add rule to the table */ } /* not a comment line */ nextline: ; } /* Read a line at a time until eof */ return errorct; } /* end readrules() */ /* * * * * * * * ** * * * * * * * */ /* * * FLAGERROR() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ int flagerror(char *s){ return (strcmp(s,"continue") && /* If s is not equal to "continue".. */ strcmp(s,"intact") && /* ..or "intact".. */ strcmp(s,"stop") && /* ..or "stop",etc... */ strcmp(s,"protect") && strcmp(s,"protint") && strcmp(s,"contint")); } /* * * * * * * * ** * * * * * * * */ /* * * STEM() * * * * * * * */ /* * * * * * * * ** * * * * * * * */ struct rule stem1(char *s, struct rule ttable[26], int debugsw, char * trace){ /* This is the main entry point of the stemmer */ /* Stem the word,using the given table */ int isintact = 1; int result; char tx[MAXWDSZ]; char trail[MAXWDSZ]; struct rule r; int rnsave; #define TRACELEN 240 int trln; int wordlen, iterations; r.rulenum = 0; /* Initialise rulenum to 0,in case no rule is used */ rnsave = 0; trace[0] = '\0'; /* Initialize trace */ /* If s is an invalid stem before we even start.. */ if((strlen(s) < MINSTEMSIZE) || (! isvalidstr(s))){ strcpy(r.text,s); /* ..then just return word */ if (strlen(r.text) > MAXSTEMSIZE) r.text[MAXSTEMSIZE] = '\0'; return r; } strcpy(tx,s); /* Make a copy of s,so we don't inadvertently alter it*/ strcpy(trail,s); wordlen = strlen(s); /* original word length */ if (debugsw) { trln = sprintf(trace, "<%s> ", tx); } iterations = 0; /* While there is still stemming to be done.. */ while((result = rulewalk(tx,ttable,isintact,&r)) != stop){ isintact = false; /* Because word is no longer intact */ if(strlen(tx) < MINSTEMSIZE){ /* Exit from loop if not acceptable stem */ break; } iterations++; /* Count number of rules executed */ if (iterations > 2*wordlen) { /* If the number of iterations exceeds twice the word length, we are probably in a loop due to defective rules. Exit loop. */ break; } if (debugsw) { if (trln + MAXWDSZ + 10 < TRACELEN) { trln += sprintf(trace+trln, " %d->%s", r.rulenum, tx); } } strcpy(trail,tx); /* Set up trail for next iteration */ rnsave = r.rulenum; r.rulenum = 0; /* Set rule number to zero to be able to check if any rule matched or did not apply */ } if(strlen(tx) < MINSTEMSIZE){ strcpy(tx,trail); /* restore previous good word */ r.rulenum = rnsave; /* Rule has already been printed. */ } else { /* acceptable */ if (r.rulenum != 0) { if (debugsw) { /* trace last rule */ if (trln + MAXWDSZ + 10 < TRACELEN) { trln += sprintf(trace+trln, " %d->%s", r.rulenum, tx); } } } else { r.rulenum = rnsave; } } if (debugsw) trln += sprintf(trace+trln,"\n", tx); /* Package stemmer output along with last rule */ strcpy(r.text,(result == stop) ? tx : trail); /* Remove apostrophe if it exists */ r.text[strcspn(r.text,"\'")] = '\0'; /* Don't return stems longer than 10 bytes (MAXSTEMSIZE) */ /* 10 bytes is specific enough for high-precision retrieval, */ /* but may have false retrievals for long medical terms: */ /* deoxyribon uclease deoxyribon ucleic deoxyribon ucleoprotein */ if (strlen(r.text) > MAXSTEMSIZE) r.text[MAXSTEMSIZE] = '\0'; return r; /* Return the rule,stemmed */ }