#include #include #include #include #include #include #include typedef struct BloomFilter { size_t n; /* capacity */ double p; /* false positive rate */ size_t m; /* number of bits in vector */ size_t k; /* number of hash functions */ char v[1]; } BF; BF *BF_new(size_t n, double p) { BF *bf; size_t m, k, nb; assert(n > 0 && p > 0 && p < 1); m = n * log(p) / log(0.6185); k = log(p) / log(0.5); assert(m > 0 && k > 0); nb = sizeof(BF) + m / 8; bf = malloc(nb); assert(bf); memset(bf, 0, nb); bf->n = n; bf->p = p; bf->m = m; bf->k = k; return bf; } BF *BF_load(FILE *fp) { size_t n; BF *bf = malloc(sizeof(BF)); assert(bf); n = fread(bf, sizeof(BF), 1, fp); assert(n == 1); assert(bf->n > 0 && bf->p > 0 && bf->p < 1); assert(bf->m > 0 && bf->k > 0); bf = realloc(bf, sizeof(BF) + bf->m / 8); assert(bf); rewind(fp); n = fread(bf, sizeof(BF) + bf->m / 8, 1, fp); assert(n == 1); return bf; } void BF_save(BF *bf, FILE *fp) { size_t nb = sizeof(BF) + bf->m / 8; size_t n = fwrite(bf, nb, 1, fp); assert(n == 1); } void BF_set(BF *bf, size_t n) { assert(bf->m >= n); bf->v[n / 8] |= (1 << (n % 8)); } int BF_isset(BF *bf, size_t n) { assert(bf->m >= n); return bf->v[n / 8] & (1 << (n % 8)); } static size_t rehash(const char digest[], int i) { size_t hash = digest[(i + 1) % 20] + (digest[(i + 2) % 20] << 8) + (digest[(i + 3) % 20] << 16) + (digest[(i + 4) % 20] << 24); hash ^= digest[(i + 6) % 20] + (digest[(i + 7) % 20] << 8) + (digest[(i + 8) % 20] << 16) + (digest[(i + 9) % 20] << 24); return hash; } void BF_add(BF *bf, const char *str, size_t len) { char digest[20]; int i; SHA1(str, len, digest); for (i = 0; i < bf->k; i++) { size_t hash = rehash(digest, i); BF_set(bf, hash % bf->m); } } int BF_exists(BF *bf, const char *str, size_t len) { char digest[20]; int i; SHA1(str, len, digest); for (i = 0; i < bf->k; i++) { size_t hash = rehash(digest, i); int set = BF_isset(bf, hash % bf->m); if (!set) return 0; } return 1; } int main(int argc, char *argv[]) { size_t n = 1024; double p = 0.01; char *e = NULL; int c; while ((c = getopt(argc, argv, "n:p:e:")) != -1) { switch (c) { case 'n': n = strtoul(optarg, NULL, 10); break; case 'p': p = atof(optarg); break; case 'e': e = optarg; break; default: exit(2); } } if (optind + 1 != argc) { fprintf(stderr, "arg count\n"); exit(2); } if (e) { int exists; FILE *fp = fopen(argv[optind], "r"); assert(fp); BF *bf = BF_load(fp); exists = BF_exists(bf, e, strlen(e)); exit(!exists); } else { char line[1024]; BF *bf = BF_new(n, p); FILE *fp = fopen(argv[optind], "r"); assert(fp); while (fgets(line, sizeof(line), fp)) { int len = strlen(line); if (line[len - 1] == '\n') len--; BF_add(bf, line, len); } BF_save(bf, stdout); } return 0; }