/* Driver for Paice/Husk Stemmer program */ /* 06/26/2003 - A. Zamora */ #include #include #include #include #include "rule.h" int readrules(FILE *fp,struct rule ttable[26], int debugsw); struct rule stem1(char *word,struct rule tbl[26], int debugsw, char *trace); int main (int argc, char * argv[]) { /* There are up to 3 arguments: rule file name, input file (optional), output file (optional) */ FILE *outfile1; int i, j, k, len; int wl, rl; int rc; #define MAXLINE 500 char line[MAXLINE]; #define MAXWORD 60 char word[MAXWORD+1]; char *result; #define DNLEN 64 char rulefile[DNLEN]; /* rule file name */ char infile[DNLEN]; char outfile[DNLEN]; FILE *infile1; FILE *rulp; int infilsw = 0; int outfilsw = 0; char *p1; struct rule ttable[26]; struct rule trule; int initialized = 0; int debug = 0; #define TRACELEN 240 char trace[TRACELEN]; /* trace output */ /* Set default rule file name */ strcpy(rulefile,"azrules.txt"); if (argc < 2) { printf("Parameters: \n rule_file, input_file (optional), output_file (optional)\n\n"); printf("Using default rule_file: azrules.txt\n\n"); } if (argc >= 2 ) { strcpy(rulefile, argv[1]); /* rule file name */ } if (argc >= 3 ) { p1 = &infile[0]; for (i=0; i< sizeof(infile)-1; i++ ) { *p1++ = argv[2][i]; if (argv[2][i] == '\0') break; } *p1 = '\0'; /* add null terminator */ /* strcpy(infile, argv[2]); input file name */ infilsw = 1; } outfile[0] = '\0'; /* null output file name */ if (argc >= 4 ) { p1 = &outfile[0]; for (i=0; i< sizeof(outfile)-1; i++ ) { *p1++ = argv[3][i]; if (argv[3][i] == '\0') break; } *p1 = '\0'; /* add null terminator */ /* strcpy(outfile, argv[3]); output file name */ rc = remove(outfile); /* erase file, if present */ outfilsw = 1; outfile1 = fopen(outfile,"a"); /* open in append mode */ if (outfile1 == NULL) return 4; } /* interactive loop */ memset(line, 0, MAXLINE); if (infilsw == 0) { /* Interactive input */ printf("\nInput line of text to be stemmed:\n"); result = fgets(line,MAXLINE-1,stdin); /* read first line */ } else { /* batch input */ infile1 = fopen(infile, "r"); if (infile1 == NULL) return 4; result = fgets(line,MAXLINE-1,infile1); /* read first line */ } while (result != NULL) { /* remove trailing CR, LF, and blank */ i = strlen(line)-1; while (line[i] == '\n' || line[i] == '\r' || line[i] == ' ') line[i--] = '\0'; len = strlen(line); /* Don't do anything for empty lines */ if (len == 0) goto next1; /* convert tabs to blanks */ for (i=0; i < len; i++) { if (line[i] == '\t') line[i] = ' '; } i = 0; /* initialize scan position */ while (i < len) { /* process a line */ /* Tokenize input line */ while (!isalpha(line[i]) && i < len) { /* find word start */ i++; } if (i >= len) break; /* exit while (i < len) */ j = i; k = 0; /* find word end and convert to lower case */ while (isalpha(line[j]) && (j < len) && (k < MAXWORD)) { word[k++] = tolower(line[j++]); } word[k] = '\0'; /* add null terminator */ if (strcmp(word,"debugon") == 0) debug = 1; if (strcmp(word,"debugoff") == 0) debug = 0; if (strcmp(word,"printrules") == 0) debug = 1; if (initialized == 0) { initialized = 1; /* Initialization is within the loop to be able to print the stemming rules based on user input. */ /* If file of stemmer rules is not opened correctly exit. */ if((rulp = fopen(rulefile,"r")) == NULL){ printf("\n*** ERROR *** - Cannot open rule file.\n"); exit(1); } /* Read in a rule set from the file */ rc = readrules(rulp,ttable,debug); fclose(rulp); /* And then close the file */ if (rc > 0) { printf("\n*** ERROR *** - Rule file contains %d error(s).\n",rc); exit (1); } if (strcmp(word,"printrules") == 0) { debug = 0; goto next1; } } /* We have isolated a word, now stem it */ trace[0] = '\0'; /* initialize trace to null */ wl = strlen(word); /* word length */ if (wl > MAXWDSZ) { /* truncate long words */ strncpy(trule.text, word, MAXSTEMSIZE); trule.text[MAXSTEMSIZE] = '\0'; /* terminate string */ } else { trule = stem1(word,ttable,debug,trace); /* Stem the word.. */ } /* print trace to SYSOUT*/ if (trace[0] != '\0') { printf("%s",trace); } /* print in format: "root ing" with space after the root */ rl = strlen(trule.text); /* root length */ if (outfilsw == 0) { /* Interactive input */ /* printf("%s %s %d\n",word, trule.text,trule.rulenum); */ if (wl <= rl) /* root == word */ printf("%s\n",trule.text); else { if (strncmp(trule.text,word,rl) == 0) { /* stem is a substring of the word */ p1 = word + rl; /* address of ending */ printf("%s %s %d\n",trule.text, p1, trule.rulenum); } else { /* stem has characters that are not in original word */ printf("%s %d\n",trule.text, trule.rulenum); } } } else { /* batch input */ if (wl <= rl) /* root == word */ fprintf(outfile1,"%s\n",word); else { if (strncmp(trule.text,word,rl) == 0) { /* stem is a substring of the word */ p1 = word + rl; /* address of ending */ fprintf(outfile1,"%s %s %d\n",trule.text, p1, trule.rulenum); } else { /* stem has characters that are not in original word */ fprintf(outfile1,"%s\n",trule.text); } } } i = j; } /* end while (i < len) */ next1: ; /* we get here for empty lines */ memset(line, 0, MAXLINE); if (infilsw == 0) { /* Interactive input */ printf("\nInput line of text to be stemmed:\n"); result = fgets(line,MAXLINE-1,stdin); /* read line */ } else { /* batch input */ result = fgets(line,MAXLINE-1,infile1); /* read line */ } } /* end while (result != NULL)*/ if (infilsw != 0) { /* batch input */ fclose(infile1); } if (outfilsw != 0) { fclose(outfile1); } return 0; } /* end main */