/* Word count program for unlimited size of input file. (c) Kang Seung-Shik at Kookmin university, June 11, 2004 */ #include #include #include typedef long ULONG; typedef unsigned char UCHAR; ULONG nBYTES; /* ÀÔ·Â ÆÄÀÏÀ» memory¿¡ loadÇÏ¿© Æ÷ÀÎÅ͸¦ return */ UCHAR *load_text(char *filename) { UCHAR *p; FILE *fp=fopen(filename, "rb"); if (!fp) return NULL; // file open failed fseek(fp, 0L, 2); nBYTES = ftell(fp); /* n: byte size of file 'fp' */ p = (UCHAR *) malloc(nBYTES+100); /* memory allocation */ if (p == NULL) return NULL; fseek(fp, 0L, 0); fread((char *) p, sizeof(char), (int)nBYTES, fp); /* read 'fp' to 'p' */ *(p+nBYTES) = '\0'; fclose(fp); return (UCHAR *) p; } int myisspace(UCHAR c) { return (c == ' ' || c == '\t' || c == 0x0d || c == 0x0a); } int myispunct(UCHAR c) { return (c == '.' || c == ',' || c == '?' || c == '!'); } /* °¢ ¾îÀýÀ» ÀνÄÇÏ¿© ¾îÀý ³¡¿¡ '\0'¸¦ settingÇÏ°í, ¾îÀý °³¼ö¸¦ countÇÏ¿© return */ ULONG count_words(UCHAR *text) { ULONG n=0; UCHAR *p=text; while (*p && p-text < nBYTES) { // while (*p && (myisspace(*p) || ispunct(*p))) { while (*p && myisspace(*p)) { *p = ' '; p++; // skip blanks } while (*p && !myisspace(*p)) p++; while (*p && myispunct(*(p-1))) *p-- = ' '; if (p > text && !myisspace(*(p-1))) { *p++ = '\0'; n++; } else { *p = ' '; } } return n; } /* ¾îÀý p°¡ word[i]¿¡ Á¸ÀçÇÏ´ÂÁö °Ë»çÇÏ¿©, Á¸ÀçÇÏ¸é ¾îÀý count Áõ°¡½ÃÅ´. */ ULONG check_word(UCHAR *p, UCHAR *word[], ULONG idx[], ULONG cnt[], ULONG k) { ULONG low=0, cur=0, upp=k; int flag; while ( low < upp ) { cur = (upp + low) / 2; flag = strcmp((char*)p, (char*)word[idx[cur]]); if (!flag) { cnt[idx[cur]]++; // ¾îÀý count Áõ°¡ return -1; } else if (flag > 0) { low = cur + 1; } else upp = cur; } return (low > cur ? low : cur); } ULONG check_word_and_count(ULONG nth, UCHAR *p, UCHAR *word[], ULONG idx[], ULONG cnt[], ULONG n) { ULONG i, j; if (*p == '\0') return n; j = check_word(p, word, idx, cnt, n); // ¾îÀý ¸®½ºÆ®¿¡ Á¸ÀçÇÏ´ÂÁö, Á¸ÀçÇϸé count Áõ°¡ if (j >= 0) { // new word¿¡ ´ëÇØ ÃʱâÈ­ for (i=n; i > j; i--) idx[i] = idx[i-1]; idx[j] = nth; cnt[idx[j]] = 1; return n+1; // ¾îÀý °³¼ö°¡ 1°³ ´Ã¾î³µÀ½. } return n; } /* text¿¡ ÀÖ´Â °¢ ¾îÀýµé¿¡ ´ëÇØ word count */ ULONG word_count(UCHAR *text, ULONG n, UCHAR *word[], ULONG idx[], ULONG cnt[]) { ULONG i, k=0; UCHAR *p=text; for (i=0; i < n; i++) { while (myisspace(*p)) p++; word[i] = p; k = check_word_and_count(i, p, word, idx, cnt, k); p += strlen((char*)p) + 1; } return k; } /* word count ¼øÀ¸·Î sorting */ void sort(ULONG idx[], ULONG cnt[], ULONG n) { ULONG i, j, temp; for (i=0; i < n-1; i++) { for (j=i+1; j < n; j++) { if (cnt[idx[i]] < cnt[idx[j]]) { temp = idx[i]; idx[i] = idx[j]; idx[j] = temp; } } } } main(int argc, char *argv[]) { UCHAR *text; // ÀÔ·Â ÆÄÀÏ Àüü¸¦ ¿©±â¿¡ load ÇÑ ÈÄ¿¡ °¢ ¾îÀý ³¡À» null¹®ÀÚ·Î setting UCHAR **word; // text¿¡ ÀÖ´Â ¾îÀý ½ºÆ®¸µ¿¡ ´ëÇÑ Æ÷ÀÎÅÍ ULONG *cnt; // cnt[i] --> word count for word[i] ULONG *idx; // index of sorting order for word[i] ULONG i; ULONG n; // textÀÇ ¾îÀý °³¼ö ULONG n2=0; // Áߺ¹À» Á¦¿ÜÇÑ ¾îÀý °³¼ö int nargs=argc; if (argc >= 2 && argv[1][0] == '-') nargs--; if (nargs == 1) { puts("$ run [-s] "); return 0; } text = load_text(argv[argc-1]); n = count_words(text); cnt = (ULONG *) calloc(n, sizeof(ULONG)); idx = (ULONG *) calloc(n, sizeof(ULONG)); word = (UCHAR **) calloc(n, sizeof(UCHAR *)); if (!text || !cnt || !word) { puts("Error: memory allocation."); return 0; } for (i=0; i < n; i++) { cnt[i] = 0L; word[i] = NULL; } n2 = word_count(text, n, word, idx, cnt); if (strstr(argv[1], "-s")) sort(idx, cnt, n2); for (i=0; i < n2; i++) printf("%5d: %s\n", cnt[idx[i]], word[idx[i]]); free(text); free(cnt); free(word); free(idx); return 0; }