zc.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <getopt.h>
  5. #define PACKAGE "wgram"
  6. #define VERSION "0.0.4"
  7. #define MAXLINE 1024
  8. #define MAXGRAM 32
  9. /* print the usage text and exit with status `exval' */
  10. void print_help(int exval);
  11. int main (int argc, char *argv[]) {
  12. /* word delimeter for strtok() */
  13. char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
  14. char line[MAXLINE]; /* input buff, fgets() */
  15. char *stray = NULL; /* returned value by strtok() */
  16. char **strarray = NULL; /* array to hold all entrys */
  17. int i = 0; /* general counter */
  18. int strcount = 0; /* number of entrys in pointer array */
  19. int N = 3, pos = 0; /* ngram size, 3 in this case */
  20. int opt = 0; /* holds command line opt nr.. */
  21. int word_flag = 0; /* print only the `raw' words */
  22. FILE *fp = stdin; /* read input from `FILE', default is stdin */
  23. while((opt = getopt(argc, argv, "hvn:wf:")) != -1) {
  24. switch(opt) {
  25. case 'h':
  26. print_help(0);
  27. break;
  28. case 'v':
  29. exit(0);
  30. break;
  31. case 'n':
  32. N = atoi(optarg);
  33. if(N > MAXGRAM || N < 2) {
  34. fprintf(stderr, "%s: Error - Ngram length `%d' out of range `0-%d'\n",
  35. PACKAGE, N, MAXGRAM);
  36. return 1;
  37. }
  38. break;
  39. case 'w':
  40. word_flag = 1;
  41. break;
  42. case 'f':
  43. if(freopen(optarg, "r", fp) == NULL) {
  44. fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg);
  45. return 1;
  46. }
  47. break;
  48. case '?':
  49. fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt);
  50. print_help(1);
  51. } /* switch */
  52. } /* while */
  53. /* start reading lines from file pointer, add all entrys to **strarray */
  54. while((fgets(line, MAXLINE, fp)) != NULL) {
  55. if(strlen(line) < 2)
  56. continue;
  57. stray = strtok(line, delim);
  58. while(stray != NULL) {
  59. strarray = (char **)realloc(strarray, (strcount + 1) * sizeof(char *));
  60. strarray[strcount++] = strdup(stray);
  61. stray = strtok(NULL, delim);
  62. }
  63. }
  64. if(word_flag == 0) {
  65. /*
  66. // print the array of strings, jumping back each time
  67. // (N - 1) positions if a whole ngram of words has been printed
  68. */
  69. for(i = 0, pos = N; i < strcount; i++, pos--) {
  70. if(pos == 0) pos = N, i -= (N - 1), printf("\n");
  71. printf("%s ", strarray[i]);
  72. }
  73. printf("\n");
  74. } else {
  75. /* print raw words */
  76. for(i = 0; i < strcount; i++)
  77. printf("%s\n", strarray[i]);
  78. }
  79. /* free the string array */
  80. for(i = 0; i < strcount; i++)
  81. free(strarray[i]);
  82. free(strarray);
  83. return 0;
  84. }
  85. /* status epilepticus .. print help */
  86. void print_help(int exval) {
  87. printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION);
  88. printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE);
  89. printf(" -h print this help and exit\n");
  90. printf(" -v print version and exit\n\n");
  91. printf(" -n INT set ngram length (default=3)\n");
  92. printf(" -w print only the extracted words\n");
  93. printf(" -f FILE read input from `FILE' (default=stdin)\n\n");
  94. exit(exval);
  95. }