diff --git a/include/mupdf/pdf/document.h b/include/mupdf/pdf/document.h index 0bad2fecfb..fd7052eb30 100644 --- a/include/mupdf/pdf/document.h +++ b/include/mupdf/pdf/document.h @@ -763,6 +763,7 @@ typedef struct int do_use_objstms; /* Use objstms if possible */ int compression_effort; /* 0 for default. 100 = max, 1 = min. */ int do_labels; /* Add labels to each object showing how it can be reached from the Root. */ + int do_strip_invisible_text; /* Strip invisible text (text render mode 3). */ } pdf_write_options; FZ_DATA extern const pdf_write_options pdf_default_write_options; diff --git a/include/mupdf/pdf/interpret.h b/include/mupdf/pdf/interpret.h index 79cb108558..a8c35996e3 100644 --- a/include/mupdf/pdf/interpret.h +++ b/include/mupdf/pdf/interpret.h @@ -362,6 +362,7 @@ typedef struct int (*text_filter)(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox); void (*after_text_object)(fz_context *ctx, void *opaque, pdf_document *doc, pdf_processor *chain, fz_matrix ctm); int (*culler)(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type); + int strip_invisible_text; /* Non-zero: filter_show_char flags a glyph for removal when the pending text render mode is 3. */ } pdf_sanitize_filter_options; diff --git a/source/pdf/pdf-op-filter.c b/source/pdf/pdf-op-filter.c index 16efa7cde0..f39d79fab4 100644 --- a/source/pdf/pdf-op-filter.c +++ b/source/pdf/pdf-op-filter.c @@ -635,7 +635,11 @@ filter_show_char(fz_context *ctx, pdf_sanitize_processor *p, int cid, int *unico } *unicode = ucsbuf[0]; - if (p->options->text_filter || p->options->culler) + if (p->options->strip_invisible_text && gstate->pending.text.render == 3) + { + remove = 1; /* Takes precedence: text_filter and culler are not consulted for this glyph. */ + } + else if (p->options->text_filter || p->options->culler) { fz_matrix ctm; fz_rect bbox; diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c index b5450c08c5..b410c1430d 100644 --- a/source/pdf/pdf-write.c +++ b/source/pdf/pdf-write.c @@ -1721,7 +1721,7 @@ static void complete_signatures(fz_context *ctx, pdf_document *doc, pdf_write_st } } -static void 
clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii, int newlines) +static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii, int newlines, int strip_invisible_text) { int n = pdf_count_pages(ctx, doc); int i; @@ -1733,7 +1733,9 @@ static void clean_content_streams(fz_context *ctx, pdf_document *doc, int saniti options.recurse = 1; options.ascii = ascii; options.newlines = newlines; - options.filters = sanitize ? list : NULL; + options.filters = sanitize || strip_invisible_text ? list : NULL; /* Requesting stripping alone is enough to install the filter list (list[0] is the sanitize filter). */ + if (strip_invisible_text) + sopts.strip_invisible_text = 1; list[0].filter = pdf_new_sanitize_filter; list[0].options = &sopts; @@ -1916,6 +1918,8 @@ pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *ar opts->do_clean = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "sanitize", &val)) opts->do_sanitize = fz_option_eq(val, "yes"); + if (fz_has_option(ctx, args, "strip-invisible-text", &val)) + opts->do_strip_invisible_text = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "incremental", &val)) opts->do_incremental = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "objstms", &val)) @@ -1990,12 +1994,12 @@ prepare_for_save(fz_context *ctx, pdf_document *doc, const pdf_write_options *in fz_throw(ctx, FZ_ERROR_ARGUMENT, "annotations need resynthesis before saving"); /* Rewrite (and possibly sanitize) the operator streams */ - if (in_opts->do_clean || in_opts->do_sanitize) + if (in_opts->do_clean || in_opts->do_sanitize || in_opts->do_strip_invisible_text) { pdf_begin_operation(ctx, doc, "Clean content streams"); fz_try(ctx) { - clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii, in_opts->do_pretty); + clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii, in_opts->do_pretty, in_opts->do_strip_invisible_text); pdf_end_operation(ctx, doc); } fz_catch(ctx) @@ -2722,6 +2726,7 @@ void pdf_write_document(fz_context *ctx, 
pdf_document *doc, fz_output *out, cons in_opts->do_linear || in_opts->do_clean || in_opts->do_sanitize || + in_opts->do_strip_invisible_text || in_opts->do_appearance || in_opts->do_encrypt != PDF_ENCRYPT_KEEP) fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't use these options when snapshotting!"); @@ -2864,6 +2869,8 @@ pdf_format_write_options(fz_context *ctx, char *buffer, size_t buffer_len, const ADD_OPT("linearize=yes"); if (opts->do_clean) ADD_OPT("clean=yes"); + if (opts->do_strip_invisible_text) + ADD_OPT("strip-invisible-text=yes"); if (opts->do_sanitize) ADD_OPT("sanitize=yes"); if (opts->do_incremental) diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c index 97186fda1c..ff0e91e51c 100644 --- a/source/tools/pdfclean.c +++ b/source/tools/pdfclean.c @@ -61,6 +61,7 @@ static int usage(void) "\t-i\tcompress image streams\n" "\t-c\tclean content streams\n" "\t-s\tsanitize content streams\n" + "\t-I\tstrip invisible text\n" "\t-t\tcompact object syntax\n" "\t-tt\tindented object syntax\n" "\t-L\twrite object labels\n" @@ -133,7 +134,7 @@ int pdfclean_main(int argc, char **argv) opts.write = pdf_default_write_options; opts.write.dont_regenerate_id = 1; - while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:LO:U:P:SZ", longopts)) != -1) + while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:ILO:U:P:SZ", longopts)) != -1) { switch (c) { @@ -149,6 +150,7 @@ int pdfclean_main(int argc, char **argv) case 'l': opts.write.do_linear += 1; break; case 'c': opts.write.do_clean += 1; break; case 's': opts.write.do_sanitize += 1; break; + case 'I': opts.write.do_strip_invisible_text += 1; break; case 't': pretty = (pretty < 0) ? 0 : 1; break; case 'A': opts.write.do_appearance += 1; break; case 'L': opts.write.do_labels = 1; break;