A pbm image filter for scanned multipage documents
I always scan my letters, important and unimportant alike, into digital documents, while the physical papers are archived for only a few months and then discarded. Digital documents are much easier to sort and categorize, and they are digitally searchable. Besides, the next time I have to move to another apartment, there is no need to carry all the papers. However, the scanned documents sometimes have artifacts such as false positives (stray black dots) in the empty spaces and white spots within black strokes. To fix that, I wrote an image filter that handles them in a quick and dirty way.
2020-01-22

Purpose

A filter for the scanned *.pbm images to improve the scan quality.

Typical Usage

Usually, I let the scanned document go through pdfsizeopt in order to compress the multipage PDF. When necessary, the first pass can use this filter to pre-process the PDF in the following way:
pdfsizeopt --use-pngout=NO --use-jbig2=NO --use-sam2p-pr=NO \
--use-image-optimizer="convert %(sourcefnq)s pbm:- | scan-filter | convert pbm:- %(targetfnq)s" \
document.pdf
where the scan-filter is the compiled binary of this program.

Source Code

Even though I am proud of the signal processing and digital filter knowledge I acquired in college, this filter was implemented only quickly, since I am already satisfied with the processing results.

The following source code can be compiled with any standard C compiler.

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <string.h>
  4 
  5 static size_t bytes_of_row;
  6 static size_t width, height;
  7 static char *buf;
  8 
  9 char get_pixel(int row, int col)
 10 {
 11     const unsigned char byte = buf[row * bytes_of_row + col / 8];
 12     return (byte & (1 << (7-col%8))) ? 1 : 0;
 13 }
 14 
 15 void set_pixel(int row, int col, char val)
 16 {
 17     const size_t byte_index = row * bytes_of_row + col / 8;
 18     const unsigned char byte_mask = 1 << (7-col%8);
 19     if (val) {
 20         buf[byte_index] |= byte_mask;
 21     } else {
 22         buf[byte_index] &= ~byte_mask;
 23     }
 24 }
 25 
 26 void read_image()
 27 {
 28     char magic[10];
 29     scanf("%9s", magic);
 30     if (strcmp(magic, "P4")){
 31         puts("Only the binary pbm is implemented");
 32         exit(2);
 33     }
 34     scanf("%lu%lu", &width, &height);
 35     getchar();
 36     bytes_of_row = (width+7) / 8;
 37     buf = calloc(bytes_of_row, height);
 38     if (fread(buf, bytes_of_row, height, stdin) != height) {
 39         puts("Not enough pixels in the image");
 40         exit(3);
 41     }
 42     if (getchar() != EOF) {
 43         puts("Only one image per file is supported or too much data");
 44         exit(4);
 45     }
 46 }
 47 
 48 size_t filter_dots()
 49 {
 50     size_t row, col;
 51     int counter = 0;
 52     for (row = 1; row < height-1; ++row) {
 53         for (col = 1; col < width-1; ++col) {
 54             if ( get_pixel(row, col) == 1 &&
 55                  get_pixel(row-1, col-1) +
 56                  get_pixel(row-1, col)*2 +
 57                  get_pixel(row-1, col+1) +
 58                  get_pixel(row, col-1)*2 +
 59                  get_pixel(row, col+1)*2 +
 60                  get_pixel(row+1, col-1) +
 61                  get_pixel(row+1, col)*2 +
 62                  get_pixel(row+1, col+1) <= 1
 63                ) {
 64                 set_pixel(row, col, 0);
 65                 ++counter;
 66             }
 67         }
 68     }
 69     fprintf(stderr, "Removed black dot: %d\n", counter);
 70     return counter;
 71 }
 72 
 73 size_t fill_white()
 74 {
 75     size_t row, col;
 76     int counter = 0;
 77     for (row = 1; row < height-1; ++row) {
 78         for (col = 1; col < width-1; ++col) {
 79             if ( get_pixel(row, col)   == 0 &&
 80                  get_pixel(row-1, col-1) +
 81                  get_pixel(row-1, col)*2 +
 82                  get_pixel(row-1, col+1) +
 83                  get_pixel(row, col-1)*2 +
 84                  get_pixel(row, col+1)*2 +
 85                  get_pixel(row+1, col-1) +
 86                  get_pixel(row+1, col)*2 +
 87                  get_pixel(row+1, col+1) >= 8
 88             ) {
 89                 set_pixel(row, col, 1);
 90                 ++counter;
 91             }
 92         }
 93     }
 94     fprintf(stderr, "Filled white dot: %d\n", counter);
 95     return counter;
 96 }
 97 
 98 
 99 void write_image()
100 {
101     printf("P4\n%lu %lu\n", width, height);
102     fwrite(buf, bytes_of_row, height, stdout);
103 }
104 
/*
 * Run the two clean-up passes on the loaded image.
 *
 * First fill_white() is repeated until a pass changes nothing, then
 * filter_dots() is repeated until a pass changes nothing.
 */
void process()
{
    size_t changed;

    do {
        changed = fill_white();
    } while (changed != 0);

    do {
        changed = filter_dots();
    } while (changed != 0);
}
110 
/*
 * Entry point: read a binary PBM from stdin, filter it, write it to stdout.
 *
 * The program takes no arguments. If any are given, a usage note is printed
 * and the program returns 1. The note goes to stderr because stdout is the
 * image stream when the program is used as a filter in a pipeline.
 */
int main(int argc, char *argv[])
{
    if (argc > 1) {
        fprintf(stderr,
                "Usage: %s\n It consumes a pbm from the standard input.\n",
                argv[0]);
        return 1;
    }
    read_image();
    process();
    write_image();
    return 0;
}

Known Issue With pdfsizeopt

Pdfsizeopt compares the sizes of all the candidate versions of each image. If our filtered version is larger, pdfsizeopt will not use it to replace the image! To work around this, modify lib/pdfsizeopt/main.py around line 7815:
    def CompareObjInfo(a, b):
        # Instead of the original comparison:
        return -1
        # The original comparison:
        # Compare first by byte size, then by command name.
        # return a[0].__cmp__(b[0]) or CompareStr(a[1], b[1])