diff --git a/README.rst b/README.rst index 9bc16420..ec16452d 100644 --- a/README.rst +++ b/README.rst @@ -368,6 +368,14 @@ of WARC records, if possible. An exit value of 1 indicates a failure. ``warcio check -v`` will print verbose output for each record in the WARC file. +Test +~~~~ + +The ``warcio test`` command will check one or more WARC files against +the WARC standard, giving commentary about standards violations, +recommendations, and other issues. + + Recompress ~~~~~~~~~~ diff --git a/setup.py b/setup.py index 0203bb64..f0390160 100755 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand import glob +import sys __version__ = '1.7.1' @@ -21,6 +22,15 @@ def run_tests(self): errcode = pytest.main(['--doctest-modules', './warcio', '--cov', 'warcio', '-v', 'test/']) sys.exit(errcode) +tests_require = [ + 'pytest', + 'pytest-cov', + 'httpbin==0.5.0', + 'requests', +] +if sys.version_info < (3, 3): + tests_require.append('ipaddress') + setup( name='warcio', version=__version__, @@ -44,12 +54,7 @@ def run_tests(self): """, cmdclass={'test': PyTest}, test_suite='', - tests_require=[ - 'pytest', - 'pytest-cov', - 'httpbin==0.5.0', - 'requests', - ], + tests_require=tests_require, classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', diff --git a/test/data/example-digest.warc b/test/data/example-digest-bad.warc similarity index 100% rename from test/data/example-digest.warc rename to test/data/example-digest-bad.warc diff --git a/test/data/example-digest-bad.warc.test b/test/data/example-digest-bad.warc.test new file mode 100644 index 00000000..15a5efaf --- /dev/null +++ b/test/data/example-digest-bad.warc.test @@ -0,0 +1,22 @@ +test/data/example-digest-bad.warc + WARC-Record-ID + WARC-Type request + payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + error: WARC-IP-Address should be used for http and https requests + 
WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To diff --git a/test/data/example.warc.test b/test/data/example.warc.test new file mode 100644 index 00000000..52b3c79f --- /dev/null +++ b/test/data/example.warc.test @@ -0,0 +1,16 @@ +test/data/example.warc + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type revisit + digest present but not checked (revisit) + recommendation: Missing recommended header: WARC-Refers-To + comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc new file mode 100644 index 00000000..a928a4c4 --- /dev/null +++ b/test/data/standard-torture-validate-field.warc @@ -0,0 +1,56 @@ +WARC/1.0 +WARC-Target-URI: +WARC-Target-URI: example.com +WARC-Target-URI: ex ample.com +WARC-Target-URI: h<>ttp://example.com/ +WARC-Type: does-not-exist +WARC-Type: CAPITALIZED +WARC-Concurrent-To: http://example.com/ +WARC-Concurrent-To: 
+WARC-Record-ID: +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +Content-Type: asdf +Content-Type: has space/asdf +Content-Type: asdf/has space +Content-Type: asdf/has space;asdf +WARC-Block-Digest: asdf +WARC-Block-Digest: has space:asdf +WARC-Block-Digest: sha1:&$*^&*^#*&^ +WARC-IP-Address: 1.2.3.4.5 +WARC-Truncated: invalid +WARC-Warcinfo-ID: asdf:asdf +WARC-Filename: not-yet-tested +WARC-Profile: asdf +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest +WARC-Identified-Payload-Type: asdf +WARC-Segment-Origin-ID: http://example.com +WARC-Segment-Number: not-an-integer +WARC-Segment-Number: 0 +WARC-Segment-Number: 1 +WARC-Segment-Number: 2 +WARC-Segment-Total-Length: 0 +WARC-Segment-Total-Length: not-an-integer +WARC-Refers-To-Target-URI: http://example.com +WARC-Refers-To-Date: not-a-date +WARC-Refers-To-Filename: asdf +WARC-Refers-To-File-Offset: 1234 +WARC-Unknown-Field: asdf +Content-Length: 0 + + +WARC/1.1 +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +WARC-Date: 2017-03-06T04:03:53.0Z +WARC-Type: invalid +Content-Length: 0 + + +WARC/1.1 +WARC-Type: request +WARC-Segment-Number: 1 +Content-Length: 0 + + +WARC/invalid diff --git a/test/data/standard-torture-validate-field.warc.test b/test/data/standard-torture-validate-field.warc.test new file mode 100644 index 00000000..de2e3fe1 --- /dev/null +++ b/test/data/standard-torture-validate-field.warc.test @@ -0,0 +1,80 @@ +test/data/standard-torture-validate-field.warc + WARC-Record-ID + WARC-Type does-not-exist + unknown hash algorithm name in block digest + error: uri must not be within <>: WARC-Target-URI + error: Duplicate field seen: WARC-Target-URI example.com + error: Invalid uri, no scheme: WARC-Target-URI example.com + error: Duplicate field seen: WARC-Target-URI ex ample.com + error: Invalid uri, no scheme: WARC-Target-URI ex ample.com + error: Invalid uri, contains whitespace: WARC-Target-URI ex ample.com + error: Duplicate field seen: 
WARC-Target-URI h<>ttp://example.com/ + error: Invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ + error: Duplicate field seen: WARC-Type CAPITALIZED + error: uri must be within <>: WARC-Concurrent-To http://example.com/ + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: Must contain a /: Content-Type asdf + error: Invalid subtype: Content-Type asdf + error: Duplicate field seen: Content-Type has space/asdf + error: Invalid type: Content-Type has space/asdf + error: Duplicate field seen: Content-Type asdf/has space + error: Invalid subtype: Content-Type asdf/has space + error: Duplicate field seen: Content-Type asdf/has space;asdf + error: Invalid subtype: Content-Type asdf/has space;asdf + error: Missing algorithm: WARC-Block-Digest asdf + error: Duplicate field seen: WARC-Block-Digest has space:asdf + error: Invalid algorithm: WARC-Block-Digest has space:asdf + error: Duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^ + error: Invalid ip: WARC-IP-Address 1.2.3.4.5 + error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf + error: Duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: Must contain a /: WARC-Identified-Payload-Type asdf + error: Invalid subtype: WARC-Identified-Payload-Type asdf + error: uri must be within <>: WARC-Segment-Origin-ID http://example.com + error: Must be an integer: WARC-Segment-Number not-an-integer + error: Duplicate field seen: WARC-Segment-Number 0 + error: Must be 1 or greater: WARC-Segment-Number 0 + error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 + error: Duplicate field seen: WARC-Segment-Number 1 + error: Duplicate field seen: WARC-Segment-Number 2 + error: Non-continuation records must always have 
WARC-Segment-Number: 1: WARC-Segment-Number 2 + error: Duplicate field seen: WARC-Segment-Total-Length not-an-integer + error: Must be an integer: WARC-Segment-Total-Length not-an-integer + error: Invalid timestamp: WARC-Refers-To-Date not-a-date + comment: Unknown WARC-Type: WARC-Type does-not-exist + comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED + comment: Unknown WARC-Type: WARC-Type CAPITALIZED + comment: Unknown digest algorithm: WARC-Block-Digest asdf + comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: Unknown value, perhaps an extension: WARC-Truncated invalid + comment: Unknown value, perhaps an extension: WARC-Profile asdf + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234 + comment: Unknown field, no validation performed: WARC-Unknown-Field asdf + WARC-Record-ID None + WARC-Type invalid + digest not present + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z + comment: Unknown WARC-Type: WARC-Type invalid + WARC-Record-ID None + WARC-Type request + digest not present + error: Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Record-ID + error: Missing required header: WARC-Target-URI + recommendation: Do not segment WARC-Type request + saw exception ArchiveLoadFailed: Invalid WARC record, first line: WARC/invalid + skipping rest of file +global warcinfo checks + comment: 
WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To + comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc new file mode 100644 index 00000000..da6a2aaf --- /dev/null +++ b/test/data/standard-torture-validate-record.warc @@ -0,0 +1,136 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +WARC-Refers-To: probhibited +Content-Length: 146 + + first line can't start with a space +test: invalid utf8 Ã( +test: lines should end with \r\n +foo: + bar + +no colon +token cannot have a space: + + +WARC/1.0 +WARC-Record-ID: +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 0 + + +WARC/1.0 +WARC-Type: warcinfo +WARC-Record-ID: +Content-Type: not-application/warc-fields +Content-Length: 5 + +foo + + +WARC/1.0 +WARC-Type: response +WARC-Record-ID: +WARC-Target-URI: HtTp://example.com/ +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Record-ID: +WARC-Target-URI: DnS:asdfasdf +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Record-ID: +WARC-Test-TODO: add another with valid block +WARC-Target-URI: DnS:asdfasdf +Content-Type: text/dns +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Record-ID: +WARC-Target-URI: foo:bar +Content-Length: 0 + + +WARC/1.0 +WARC-Type: request +WARC-Record-ID: +WARC-Target-URI: hTtP://example.com/ +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: request +WARC-Record-ID: +WARC-Target-URI: hTtP://example.com/ +WARC-IP-Address: 1.2.3.4 +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: metadata +WARC-Record-ID: +Content-Type: application/warc-fields +Content-Length: 0 + + +WARC/1.0 +WARC-Type: metadata +WARC-Record-ID: +Content-Type: not-application/warc-fields 
+Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Record-ID: +WARC-Profile: none +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Record-ID: +WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Record-ID: +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified +Content-Length: 0 + + +WARC/1.0 +WARC-Type: conversion +WARC-Record-ID: +Content-Length: 0 + + +WARC/1.0 +WARC-Type: continuation +WARC-Record-ID: +WARC-Segment-Number: 1 +Content-Length: 0 + + +WARC/1.0 +WARC-Type: continuation +WARC-Record-ID: +WARC-Segment-Number: 2 +Content-Length: 0 + + diff --git a/test/data/standard-torture-validate-record.warc.test b/test/data/standard-torture-validate-record.warc.test new file mode 100644 index 00000000..e7b17345 --- /dev/null +++ b/test/data/standard-torture-validate-record.warc.test @@ -0,0 +1,112 @@ +test/data/standard-torture-validate-record.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: uri must be within <>: WARC-Refers-To probhibited + error: Missing required header: WARC-Date + error: Missing required header: WARC-Record-ID + error: Field not allowed in record type: warcinfo WARC-Refers-To + error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte + comment: The first line of warc-fields cannot start with whitespace + comment: warc-fields lines must end with \r\n: test: lines should end with \r\n + comment: Missing colon in warc-fields line: no colon + comment: Invalid warc-fields name: token cannot have a space + WARC-Record-ID + WARC-Type warcinfo + digest not present + error: Missing required header: WARC-Date + comment: warc-fields block present but empty + WARC-Record-ID + WARC-Type warcinfo + digest not present + error: Missing required header: WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields: 
not-application/warc-fields + WARC-Record-ID + WARC-Type response + digest not present + error: Missing required header: WARC-Date + error: Responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain + error: WARC-IP-Address should be used for http and https responses + error: http/https responses should have http headers + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: WARC-Date + error: resource records for dns shall have Content-Type of text/dns: text/plain + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: WARC-Date + comment: Unknown field, no validation performed: WARC-Test-TODO add another with valid block + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + WARC-Record-ID + WARC-Type request + digest not present + error: Missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest not present + error: Missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + WARC-Record-ID + WARC-Type metadata + digest not present + error: Missing required header: WARC-Date + comment: warc-fields block present but empty + WARC-Record-ID + WARC-Type metadata + digest not present + error: Missing required header: WARC-Date + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + comment: Unknown value, perhaps an extension: WARC-Profile none + comment: No revisit details 
validation done due to unknown profile: none + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + error: Missing required header: WARC-Payload-Digest + recommendation: Missing recommended header: WARC-Refers-To + recommendation: Missing recommended header: WARC-Refers-To-Date + recommendation: Missing recommended header: WARC-Refers-To-Target-URI + comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + recommendation: Missing recommended header: WARC-Refers-To + recommendation: Missing recommended header: WARC-Refers-To-Date + WARC-Record-ID + WARC-Type conversion + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + WARC-Record-ID + WARC-Type continuation + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Segment-Origin-ID + error: Missing required header: WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1: 1 + comment: warcio test continuation code has not been tested, expect bugs + WARC-Record-ID + WARC-Type continuation + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Segment-Origin-ID + error: Missing required header: WARC-Target-URI + comment: warcio test continuation code has not been tested, expect bugs diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 10914ce5..7378c7af 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -283,6 +283,8 @@ def test_err_arc_iterator_on_warc(self): def 
test_corrects_wget_bug(self): with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record: assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' + with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record: + assert record.rec_headers.get('WARC-Target-URI') == '' def test_corrects_space_in_target_uri(self): with self._find_first_by_type('example-space-in-target-uri.warc.gz', 'resource') as record: @@ -345,9 +347,9 @@ def test_digests_file(self): expected_t = ['request', 'request', 'request'] # record 1: invalid payload digest - assert self._load_archive('example-digest.warc', check_digests=True) == expected_t - assert self._load_archive('example-digest.warc', check_digests=False) == expected_f + assert self._load_archive('example-digest-bad.warc', check_digests=True) == expected_t + assert self._load_archive('example-digest-bad.warc', check_digests=False) == expected_f # record 2: b64 digest; record 3: b64 filename safe digest - assert self._load_archive('example-digest.warc', offset=922, check_digests=True) == expected_t - assert self._load_archive('example-digest.warc', offset=922, check_digests=False) == expected_t + assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=True) == expected_t + assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=False) == expected_t diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py index 679d7d24..89eb296f 100644 --- a/test/test_check_digest_examples.py +++ b/test/test_check_digest_examples.py @@ -9,7 +9,8 @@ 'example-iana.org-chunked.warc', 'example-wrong-chunks.warc.gz', 'example-bad-non-chunked.warc.gz', - 'example-digest.warc' + 'example-digest-bad.warc', + 'standard-torture-validate-field.warc', ] @@ -34,7 +35,7 @@ def check_helper(self, args, expected_exit_value, capsys): return capsys.readouterr()[0] # list for py33 support def 
test_check_invalid(self, capsys): - filenames = [get_test_file('example-digest.warc')] + filenames = [get_test_file('example-digest-bad.warc')] args = ['check'] + filenames value = self.check_helper(args, 1, capsys) diff --git a/test/test_cli.py b/test/test_cli.py index 7bdc87f7..f8330f66 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -90,7 +90,7 @@ def test_check_valid(): def test_check_invalid(): - filenames = [get_test_file('example-digest.warc')] + filenames = [get_test_file('example-digest-bad.warc')] args = ['check'] + filenames value = check_helper(args, 1) @@ -103,7 +103,7 @@ def test_check_invalid(): assert value.count(b'digest pass') == 3 assert value.count(b'WARC-Record-ID') == 4 - files = ['example-bad-non-chunked.warc.gz', 'example-digest.warc'] + files = ['example-bad-non-chunked.warc.gz', 'example-digest-bad.warc'] filenames = [get_test_file(filename) for filename in files] args = ['check'] + filenames value = check_helper(args, 1) diff --git a/test/test_tester.py b/test/test_tester.py new file mode 100644 index 00000000..08963ea9 --- /dev/null +++ b/test/test_tester.py @@ -0,0 +1,96 @@ +from warcio.cli import main +from warcio.utils import to_native_str +import warcio.tester + +from . 
import get_test_file +from .test_cli import patch_stdout + + +file_map = {} + + +def map_test_file(filename): + file_map[filename] = get_test_file(filename) + return file_map[filename] + + +def helper(args, expected_exit_value): + with patch_stdout() as buff: + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return to_native_str(buff.getvalue()) + + +def remove_before_test_data(s): + ret = '' + for line in s.splitlines(True): + for filename, value in file_map.items(): + if value in line: + line = line.replace(value, 'test/data/' + filename) + ret += line + return ret + + +def run_one(f): + args = ['test'] + args.append(f) + + with open(f+'.test', 'r') as expectedf: + expected = expectedf.read() + + value = helper(args, 0) + print(remove_before_test_data(value)) + + actual = remove_before_test_data(value) + + assert actual == expected + + +def test_torture(): + files = ['standard-torture-validate-record.warc', + 'standard-torture-validate-field.warc'] + [run_one(map_test_file(filename)) for filename in files] + + +def test_arc(): + files = ['does-not-exist.arc'] + files = [map_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/does-not-exist.arc +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_digests(): + # needed for test coverage + files = ['example-digest-bad.warc', 'example.warc'] + [run_one(map_test_file(filename)) for filename in files] + + +def test_leftovers(): + commentary = warcio.recordloader.Commentary() + assert not commentary.has_comments() + + # hard to test because invalid WARC Content-Length raises in archiveiterator + warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) + + # hard to test because warcio raises for unknown WARC version + warcio.tester.validate_profile('blah', 'blah', 
None, '999', commentary, None) + + expected = '''\ +error: Must be an integer: Content-Length not-an-integer +''' + + assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 484b7f0f..24094936 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -56,12 +56,13 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False): + check_digests=False, fixup_bugs=True): self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, - arc2warc=arc2warc) + arc2warc=arc2warc, + fixup_bugs=fixup_bugs) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 0b7f72f7..f60ae1a5 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -64,7 +64,8 @@ class BufferedReader(object): def __init__(self, stream, block_size=BUFF_SIZE, decomp_type=None, starting_data=None, - read_all_members=False): + read_all_members=False, + commentary=None): self.stream = stream self.block_size = block_size @@ -77,6 +78,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, self.buff_size = 0 self.read_all_members = read_all_members + self.commentary = commentary def set_decomp(self, decomp_type): self._init_decomp(decomp_type) @@ -88,6 +90,10 @@ def _init_decomp(self, decomp_type): self.decomp_type = decomp_type self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]() except KeyError: + # XXX don't raise? 
+ # we don't know if the enduser cares or not + # or the record might actually be uncompressed + # XXX what does pywb do raise Exception('Decompression type not supported: ' + decomp_type) else: @@ -135,13 +141,15 @@ def _decompress(self, data): if self.decompressor and data: try: data = self.decompressor.decompress(data) - except Exception as e: + except zlib.error as e: # if first read attempt, assume non-gzipped stream if self.num_block_read == 0: if self.decomp_type == 'deflate': self._init_decomp('deflate_alt') data = self._decompress(data) else: + if self.commentary: + self.commentary.comment('Payload claimed to be compressed but apparently is not') self.decompressor = None # otherwise (partly decompressed), something is wrong else: @@ -280,40 +288,43 @@ class ChunkedDataReader(BufferedReader): If at any point the chunked header is not available, the stream is assumed to not be chunked and no more dechunking occurs. """ - def __init__(self, stream, raise_exceptions=False, **kwargs): + def __init__(self, stream, raise_exceptions=False, commentary=None, **kwargs): super(ChunkedDataReader, self).__init__(stream, **kwargs) self.all_chunks_read = False - self.not_chunked = False - - # if False, we'll use best-guess fallback for parse errors + self.not_actually_chunked = False + self.at_start = True self.raise_chunked_data_exceptions = raise_exceptions + self.commentary = commentary def _fillbuff(self, block_size=None): - if self.not_chunked: + if self.not_actually_chunked: return super(ChunkedDataReader, self)._fillbuff(block_size) # Loop over chunks until there is some data (not empty()) # In particular, gzipped data may require multiple chunks to # return any decompressed result - while (self.empty() and - not self.all_chunks_read and - not self.not_chunked): - + while (self.empty() and not self.all_chunks_read): try: length_header = self.stream.readline(64) self._try_decode(length_header) + self.at_start = False except ChunkedDataException as e: if 
self.raise_chunked_data_exceptions: raise - # Can't parse the data as chunked. # It's possible that non-chunked data is served # with a Transfer-Encoding: chunked. # Treat this as non-chunk encoded from here on. + if self.commentary: + if self.at_start: + self.commentary.comment('Buffer claimed to be chunked, but was not from the start') + else: + self.commentary.comment('Buffer is chunked but there was an unchunking error midway') self._process_read(length_header + e.data) - self.not_chunked = True + self.not_actually_chunked = True + self.at_start = False - # parse as block as non-chunked + # parse as non-chunked return super(ChunkedDataReader, self)._fillbuff(block_size) def _try_decode(self, length_header): @@ -355,6 +366,8 @@ def _try_decode(self, length_header): msg = 'Ran out of data before end of chunk' raise ChunkedDataException(msg, data) else: + if self.commentary: + self.commentary.comment('Chunked reader ran out of data before end of chunk') chunk_size = data_len self.all_chunks_read = True diff --git a/warcio/cli.py b/warcio/cli.py index efdf7c50..bbe51a93 100644 --- a/warcio/cli.py +++ b/warcio/cli.py @@ -4,6 +4,8 @@ from warcio.checker import Checker from warcio.extractor import Extractor from warcio.recompressor import Recompressor +from warcio.tester import Tester +from warcio.utils import BUFF_SIZE import sys @@ -51,6 +53,11 @@ def main(args=None): check.add_argument('-v', '--verbose', action='store_true') check.set_defaults(func=checker) + test = subparsers.add_parser('test', help='WARC standards tester') + test.add_argument('inputs', nargs='+') + test.add_argument('-v', '--verbose', action='store_true') + test.set_defaults(func=tester) + cmd = parser.parse_args(args=args) cmd.func(cmd) @@ -86,6 +93,12 @@ def recompressor(cmd): _recompressor.recompress() +# ============================================================================ +def tester(cmd): + _tester = Tester(cmd) + sys.exit(_tester.process_all()) + + # 
============================================================================ if __name__ == "__main__": #pragma: no cover main() diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 05b159df..a78cbaed 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -16,6 +16,36 @@ logger = logging.getLogger(__name__) +#================================================================= +class Commentary(object): + def __init__(self): + self.errors = [] + self.recommendations = [] + self._comments = [] + + def error(self, *args): + self.errors.append(args) + + def recommendation(self, *args): + self.recommendations.append(args) + + def comment(self, *args): + self._comments.append(args) + + def has_comments(self): + if self.errors or self.recommendations or self._comments: + return True + + def comments(self): + # XXX str() all of these, in case an int or other thing slips in? + for e in self.errors: + yield 'error: ' + ' '.join(e) + for r in self.recommendations: + yield 'recommendation: ' + ' '.join(r) + for c in self._comments: + yield 'comment: ' + ' '.join(c) + + #================================================================= class ArcWarcRecord(object): def __init__(self, *args, **kwargs): @@ -23,11 +53,16 @@ def __init__(self, *args, **kwargs): self.http_headers, self.content_type, self.length) = args self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') + self.commentary = kwargs.get('commentary') + self._content_stream = None def content_stream(self): if not self.http_headers: return self.raw_stream + if self._content_stream: + return self._content_stream + encoding = self.http_headers.get_header('content-encoding') if encoding: @@ -37,11 +72,13 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - return ChunkedDataReader(self.raw_stream, decomp_type=encoding) + self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, 
commentary=self.commentary) elif encoding: - return BufferedReader(self.raw_stream, decomp_type=encoding) + self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary) else: - return self.raw_stream + self._content_stream = self.raw_stream + + return self._content_stream #================================================================= @@ -58,7 +95,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -68,6 +105,7 @@ def __init__(self, verify_http=True, arc2warc=True): self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) + self.fixup_bugs = fixup_bugs def parse_record_stream(self, stream, statusline=None, @@ -99,7 +137,7 @@ def parse_record_stream(self, stream, elif the_format in ('warc', 'arc2warc'): rec_type = rec_headers.get_header('WARC-Type') - uri = self._ensure_target_uri_format(rec_headers) + uri = self._ensure_target_uri_format(rec_headers, fixup_bugs=self.fixup_bugs) length = rec_headers.get_header('Content-Length') content_type = rec_headers.get_header('Content-Type') if the_format == 'warc': @@ -125,6 +163,7 @@ def parse_record_stream(self, stream, is_verifying = False digest_checker = DigestChecker(check_digests) + commentary = Commentary() # limit stream to the length for all valid records if length is not None and length >= 0: @@ -149,7 +188,8 @@ def parse_record_stream(self, stream, return ArcWarcRecord(the_format, rec_type, rec_headers, stream, http_headers, - content_type, length, digest_checker=digest_checker) + content_type, length, digest_checker=digest_checker, + commentary=commentary) def wrap_digest_verifying_stream(self, stream, rec_type, 
class WrapRecord(object):
    """Proxy around a record object that adds two synthetic attributes.

    ``content`` is the full result of ``obj.content_stream().read()``,
    read once and cached.  ``stream_for_digest_check`` is a callable that
    reads ``obj.content_stream()`` in 1 MiB pieces until an empty piece is
    returned, discarding the data.  Every other attribute lookup is
    forwarded to the wrapped object.
    """

    def __init__(self, obj):
        self.obj = obj
        self._content = None

    def __getattr__(self, name):
        # Only reached for names that normal lookup did not find, so
        # 'obj' and '_content' themselves never arrive here once set.
        if name == 'stream_for_digest_check':
            def _doit():
                chunk_size = 1024*1024
                # Fetch the stream afresh for every read, as callers may
                # rely on content_stream() being consulted each time.
                while len(self.obj.content_stream().read(chunk_size)) != 0:
                    pass
            return _doit

        if name != 'content':
            # Go through __dict__ so a missing 'obj' cannot recurse.
            return getattr(self.__dict__['obj'], name)

        if self._content is None:
            self._content = self.obj.content_stream().read()
        return self._content
;, let that pass + return s.lower().replace(';msgtype=', '; msgtype=') + + +def validate_warc_fields(record, commentary): + # warc-fields = *named-field CRLF + # named-field = field-name ":" [ field-value ] + # field-value = *( field-content | LWS ) # LWS signals continuations + # field-name = token # token_re + + content = record.content + + if six.PY2: # pragma: no cover + try: + content.decode('utf-8', errors='strict') + text = content # already a str + except UnicodeDecodeError as e: + err = str(e) + err = err.replace('utf8', 'utf-8') # sigh + commentary.error('warc-fields contains invalid utf-8: '+err) + text = content.decode('utf-8', errors='replace') + else: # pragma: no cover + try: + text = to_native_str(content, 'utf-8', errors='strict') + except UnicodeDecodeError as e: + commentary.error('warc-fields contains invalid utf-8: '+str(e)) + text = to_native_str(content, 'utf-8', errors='replace') + + first_line = True + lines = [] + for line in text.splitlines(True): + if not line.endswith('\r\n'): + commentary.comment('warc-fields lines must end with \\r\\n:', line.rstrip()) + line = line.rstrip('\r\n') + else: + line = line[:-2] + + if line.startswith(' ') or line.startswith('\t'): + if first_line: + commentary.comment('The first line of warc-fields cannot start with whitespace') + else: + lines[-1] += ' ' + line[1:] + elif line == '': + # are blank lines prohibited? 
def validate_warcinfo(record, commentary, pending):
    """Check the block of a warcinfo record.

    If the record's Content-Type (case-insensitively) is
    application/warc-fields, the block is handed to validate_warc_fields();
    otherwise a recommendation naming the actual Content-Type is recorded.
    A missing Content-Type is reported as the literal 'none'.
    ``pending`` is accepted for signature uniformity and is not used.
    """
    declared_type = record.rec_headers.get_header('Content-Type', 'none')

    if declared_type.lower() == 'application/warc-fields':
        # format: warc-fields
        # allowable fields include but not limited to DMCI plus the following
        # operator, software, robots, hostname, ip, http-header-user-agent, http-header-from
        # if operator present, recommended name or name and email address
        # comment if http-user-agent here and in the request or metadata record?
        # comment if http-header-from here and in the request?
        validate_warc_fields(record, commentary)
        return

    # https://github.com/iipc/warc-specifications/issues/33 -- SHALL BE or recommended?
    commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', declared_type)

    # XXX whole-file tests:
    # recommended that all files start with warcinfo
    # elsewise allowable for warcinfo to appear anywhere
def validate_request(record, commentary, pending):
    """Check a 'request' record whose target is an http or https URI.

    For such records an error is recorded when the Content-Type is neither
    'application/http; msgtype=request' nor 'application/http' (after
    canon_content_type() normalisation), and when WARC-IP-Address is
    absent.  Records with any other target scheme are not checked.

    A missing Content-Type is treated as the literal 'none', so it is
    reported as a wrong Content-Type instead of raising while being
    normalised.  ``pending`` is accepted for signature uniformity and is
    not used.
    """
    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()

    if not target_uri.startswith(('http:', 'https:')):
        return

    # Default to 'none' so canon_content_type() always receives a string.
    content_type = record.rec_headers.get_header('Content-Type', 'none')

    if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}:
        commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http:', content_type)

    if record.rec_headers.get_header('WARC-IP-Address') is None:
        commentary.error('WARC-IP-Address should be used for http and https requests')

    # error: http and https schemes should have http request headers

    # WARC-Concurrent-To field or fields may be used, comment if present but target record is not
def validate_revisit(record, commentary, pending):
    """Check a 'revisit' record's headers against its WARC-Profile.

    The profile is recognised by its suffix.  For the two payload-digest
    profiles WARC-Payload-Digest is required and WARC-Refers-To is
    recommended (plus WARC-Refers-To-Target-URI and WARC-Refers-To-Date
    when the profile URI contains '/1.1/').  For server-not-modified,
    WARC-Refers-To and WARC-Refers-To-Date are recommended and
    WARC-Payload-Digest is prohibited.  Any other profile, including a
    missing one (reported as 'none'), only produces a comment.
    ``pending`` is accepted for signature uniformity and is not used.
    """
    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')

    digest_suffixes = ('/revisit/identical-payload-digest',
                       '/revisit/uri-agnostic-identical-payload-digest')

    if warc_profile.endswith(digest_suffixes):
        recommended = ['WARC-Refers-To']
        if '/1.1/' in warc_profile:
            recommended += ['WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date']
        config = {
            'required': ['WARC-Payload-Digest'],
            'recommended': recommended,
        }
        # may have record block;
        # if not, shall have Content-Length: 0,
        # if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
        # recommended that server response headers be preserved "in this manner"
        # I suppose that means headers are required if there is any content?!
    elif warc_profile.endswith('/revisit/server-not-modified'):
        config = {
            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'],
            'prohibited': ['WARC-Payload-Digest'],
        }
        # may have content body;
        # if not, shall have Content-Length: 0,
        # if yes, should be like a response record, truncated if desired
        # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
    else:
        commentary.comment('No revisit details validation done due to unknown profile:', warc_profile)
        return

    validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
def validate_unbracketed_uri(field, value, record, version, commentary, pending):
    """Check a header value that must be a bare uri, not wrapped in <>.

    Errors are recorded when the value starts with '<' or ends with '>',
    when it has no ':' (no scheme), when the text before the first ':'
    does not look like a uri scheme, and when it contains whitespace.
    ``record``, ``version`` and ``pending`` are accepted for signature
    uniformity and are not used.
    """
    # uri per RFC 3986
    # should use a registered scheme
    # %XX encoding, normalize to upper case
    # schemes are case-insensitive and normalize to lower
    if value.startswith('<') or value.endswith('>'):
        # wget 1.19 bug caused by WARC 1.0 spec error
        commentary.error('uri must not be within <>:', field, value)
        # Remove only the brackets that are really there, so a value
        # bracketed on one side keeps its other end for the checks below.
        if value.startswith('<'):
            value = value[1:]
        if value.endswith('>'):
            value = value[:-1]

    scheme = value.split(':', 1)[0]
    if ':' not in value:
        commentary.error('Invalid uri, no scheme:', field, value)
    elif not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
        commentary.error('Invalid uri scheme, bad character:', field, value)
    # use https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml ??

    if re.search(r'\s', value):
        commentary.error('Invalid uri, contains whitespace:', field, value)
def validate_timestamp(field, value, record, version, commentary, pending):
    """Check that a header value is a UTC timestamp of the accepted shape.

    The accepted shape is YYYY-MM-DDThh:mm:ssZ, optionally with a literal
    '.' and 1 to 9 fractional-second digits before the 'Z'.  An error is
    recorded when the value does not match.  A second, independent error
    is recorded when ``version`` compares <= '1.0' and the value contains
    a '.'.  ``record`` and ``pending`` are accepted for signature
    uniformity and are not used.
    """
    # The dot before the fraction is escaped: unescaped it would accept
    # any character in that position (e.g. '...:06x123Z').
    ISO_RE = r'\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?Z\Z'

    if not re.match(ISO_RE, value):
        commentary.error('Invalid timestamp:', field, value)

    # NOTE: versions are compared as strings, which orders the known
    # versions ('0.17', '0.18', '1.0', '1.1') correctly.
    if version <= '1.0' and '.' in value:
        # specification infelicity: would be nice to have 'advice to implementers' here
        commentary.error('WARC versions <= 1.0 may not have timestamps with fractional seconds:', field, value)
# Revisit profile URIs known for each WARC version.
profiles = {
    '0.17': ['http://netpreserve.org/warc/0.17/revisit/identical-payload-digest',
             'http://netpreserve.org/warc/0.17/revisit/server-not-modified'],
    '0.18': ['http://netpreserve.org/warc/0.18/revisit/identical-payload-digest',
             'http://netpreserve.org/warc/0.18/revisit/server-not-modified'],
    '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
            'http://netpreserve.org/warc/1.0/revisit/server-not-modified',
            'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
    '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
            'http://netpreserve.org/warc/1.1/revisit/server-not-modified'],
}

# Reverse lookup: profile URI -> the WARC version it is listed under.
profiles_rev = {}
for _profile_version, _profile_uris in profiles.items():
    for _profile_uri in _profile_uris:
        profiles_rev[_profile_uri] = _profile_version


def validate_profile(field, value, record, version, commentary, pending):
    """Check a WARC-Profile value against the table of known profiles.

    Nothing is checked when ``version`` has no entry in ``profiles``.
    Otherwise a comment is recorded when the value is unknown, or when it
    is listed under a different version than ``version``.  Independently,
    a comment is recorded for any value containing the uri-agnostic
    identical-payload-digest path.  ``record`` and ``pending`` are
    accepted for signature uniformity and are not used.
    """
    if version not in profiles:
        return

    listed_under = profiles_rev.get(value)
    if listed_under is None:
        commentary.comment('Unknown value, perhaps an extension:', field, value)
    elif listed_under != version:
        commentary.comment('WARC-Profile value is for a different version:', version, value)

    if '/revisit/uri-agnostic-identical-payload-digest' in value:
        commentary.comment('This Heretrix extension never made it into the standard:', field, value)
validate_bracketed_uri, + }, + 'WARC-Block-Digest': { + 'validate': validate_digest, + }, + 'WARC-Payload-Digest': { + 'validate': validate_digest, + }, + 'WARC-IP-Address': { + 'validate': validate_ip, + }, + 'WARC-Refers-To': { + 'validate': validate_bracketed_uri, + }, + 'WARC-Target-URI': { + 'validate': validate_unbracketed_uri, + }, + 'WARC-Truncated': { + 'validate': validate_truncated, + }, + 'WARC-Warcinfo-ID': { + 'validate': validate_warcinfo_id, + }, + 'WARC-Filename': { + 'validate': validate_filename, + }, + 'WARC-Profile': { + 'validate': validate_profile, + }, + 'WARC-Identified-Payload-Type': { + # see also https://github.com/iipc/warc-specifications/issues/49 -- odd that it's allowed for request, revisit, continuation + 'validate': validate_content_type, + }, + 'WARC-Segment-Origin-ID': { + 'validate': validate_bracketed_uri, + }, + 'WARC-Segment-Number': { + 'validate': validate_segment_number, + }, + 'WARC-Segment-Total-Length': { + 'validate': validate_segment_total_length, + }, + 'WARC-Refers-To-Target-URI': { + 'validate': validate_unbracketed_uri, + 'minver': '1.1', + }, + 'WARC-Refers-To-Date': { + 'validate': validate_timestamp, + 'minver': '1.1', + }, + 'WARC-Refers-To-Filename': { + 'validate': validate_refers_to_filename, + }, + 'WARC-Refers-To-File-Offset': { + 'validate': validate_refers_to_file_offset, + }, +} +warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()]) + +record_types = { + 'warcinfo': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], + 'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'validate': validate_warcinfo, + }, + 'response': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI'], + 'optional': 
['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_response, + }, + 'resource': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_resource, + }, + 'request': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'validate': validate_request, + }, + 'metadata': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], + 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'validate': validate_metadata, + }, + 'revisit': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 
def make_header_set(config, kinds):
    """Return the lower-cased header names listed in ``config`` under ``kinds``.

    ``config`` maps a kind to a list of header names; kinds that are not
    present in ``config`` contribute nothing.  The result is a new set.
    """
    return set(
        header.lower()
        for kind in kinds
        for header in config.get(kind, [])
    )
# Lower-cased names of the headers kept per record for the whole-run checks.
_SAVED_GLOBAL_FIELDS = frozenset(name.lower() for name in (
    'WARC-Type', 'WARC-Warcinfo-ID', 'WARC-Date',
    'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date',
    'WARC-Payload-Digest', 'WARC-Target-URI',
    'WARC-Segment-Number', 'WARC-Segment-Origin-ID',
    'WARC-Segment-Total-Length', 'WARC-Truncated',
))


def save_global_info(record, warcfile, commentary, all_records, concurrent_to):
    """Remember the headers of ``record`` needed by the cross-record checks.

    Records without a WARC-Record-ID are ignored.  Otherwise:

    * every non-None WARC-Concurrent-To value is added to
      ``concurrent_to`` in both directions (record id -> value and
      value -> record id);
    * a dict holding 'warcfile', the non-None saved headers (keyed by
      lower-cased name) and the list of all WARC-Concurrent-To values is
      stored in ``all_records`` under the record id;
    * if the record id is already in ``all_records``, nothing is stored
      and a 'Duplicate WARC-Record-ID' error is recorded instead, naming
      both files when they differ.

    ``concurrent_to`` must create missing entries on access (the caller
    passes a defaultdict(list)).
    """
    record_id = record.rec_headers.get_header('WARC-Record-ID')
    if record_id is None:
        return

    save = {'warcfile': warcfile}

    for field, value in record.rec_headers.headers:  # XXX not exported
        field_l = field.lower()
        if field_l in _SAVED_GLOBAL_FIELDS and value is not None:
            save[field_l] = value
        if field_l == 'warc-concurrent-to':
            if value is not None:
                concurrent_to[record_id].append(value)
                concurrent_to[value].append(record_id)
            save.setdefault('warc-concurrent-to', []).append(value)

    if record_id not in all_records:
        all_records[record_id] = save
        return

    previous_file = all_records[record_id]['warcfile']
    if warcfile != previous_file:
        commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, previous_file)
    else:
        commentary.error('Duplicate WARC-Record-ID:', record_id)
wanted_id not in all_records or all_records[wanted_id]['warc-type'] != 'warcinfo': + commentary.comment('WARC-Warcinfo-ID not found:', record_id, 'WARC-Warcinfo-ID', wanted_id) + + _print_global('global warcinfo checks', commentary) + + +def check_global_concurrent_to(all_records, concurrent_to): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-concurrent-to' in fields: + whole_set = set(fields['warc-concurrent-to']) + del fields['warc-concurrent-to'] + while True: + current_set = list(whole_set) + for c in current_set: + if c in all_records and 'warc-concurrent-to' in all_records[c]: + whole_set.update(set(all_records[c]['warc-concurrent-to'])) + del all_records[c]['warc-concurrent-to'] + if len(whole_set) == len(current_set): + break + warc_date = fields.get('warc-date') + for wanted_id in sorted(whole_set): + if wanted_id not in all_records: + commentary.comment('WARC-Concurrent-To not found:', record_id, 'WARC-Concurrent-To', wanted_id) + else: + new_date = all_records[wanted_id].get('warc-date') + if warc_date != new_date: + commentary.comment('WARC-Concurrent-To set has conflicting dates:', + record_id, warc_date, wanted_id, new_date) + + _print_global('global Concurrent-To checks', commentary) + + +def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, target_field, commentary): + if source_field.lower() not in fields: + return + + if target_field.lower() not in all_records[wanted_id]: + commentary.comment('Revisit target lacks field:', wanted_id, target_field) + return + + source_value = fields[source_field.lower()] + target_value = all_records[wanted_id][target_field.lower()] + if source_value != target_value: + commentary.comment('Revisit and revisit target disagree:', + record_id, source_field, source_value, + wanted_id, target_field, target_value) + + +def check_global_refers_to(all_records): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-refers-to' not 
in fields: + continue + + wanted_id = fields['warc-refers-to'] + if wanted_id not in all_records: + commentary.comment('WARC-Refers-To target not found:', record_id, 'Warc-Refers-To', wanted_id) + continue + + rec_type = fields.get('warc-type') + if rec_type != 'revisit': + continue + + _revisit_compare(record_id, fields, 'WARC-Refers-To-Target-URI', + wanted_id, all_records, 'WARC-Target-URI', commentary) + _revisit_compare(record_id, fields, 'WARC-Refers-To-Date', + wanted_id, all_records, 'WARC-Date', commentary) + _revisit_compare(record_id, fields, 'WARC-Payload-Digest', + wanted_id, all_records, 'WARC-Payload-Digest', commentary) + + _print_global('global Refers-To checks', commentary) + + +def check_global_segment(all_records): + # warc-segment-origin-id :: exists, is warc-segment-number 1 + # all segments exist, and the last one has WARC-Segment-Total-Length + # and only the last one has WARC-Truncated, if any + + # Segmentation shall not be used if a record can be stored in an existing warc file + # The origin segment shall be placed in a new warc file preceded only by a warcinfo record (if any) + + pass + + +def _process_one(warcfile, all_records, concurrent_to, verbose): + if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): + return + with open(warcfile, 'rb') as stream: + for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): + record = WrapRecord(record) + digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or + record.rec_headers.get_header('WARC-Block-Digest')) + record_id = record.rec_headers.get_header('WARC-Record-ID') + rec_type = record.rec_headers.get_header('WARC-Type') + + validate_record(record) + record.stream_for_digest_check() + + commentary = record.commentary + save_global_info(record, warcfile, commentary, all_records, concurrent_to) + + if verbose or commentary.has_comments() or record.digest_checker.passed is False: + print(' ', 'WARC-Record-ID', record_id) + print(' ', 'WARC-Type', 
rec_type) + if record.digest_checker.passed is True: + print(' digest pass') + elif record.digest_checker.passed is None: + if digest_present: + if rec_type == 'revisit': + print(' digest present but not checked (revisit)') + else: # pragma: no cover + # should not ever happen + # example reason: WARC record missing Content-Length: header, but that case raises + print(' digest present but not checked') + else: + print(' digest not present') + for p in record.digest_checker.problems: + print(' ', p) + + if commentary.has_comments(): + for c in commentary.comments(): + print(' ', c) + + +class Tester(object): + def __init__(self, cmd): + self.inputs = cmd.inputs + self.verbose = cmd.verbose + self.exit_value = 0 + self.all_records = defaultdict(dict) + self.concurrent_to = defaultdict(list) + + def process_all(self): + for warcfile in self.inputs: + print(warcfile) + try: + self.process_one(warcfile) + except ArchiveLoadFailed as e: + print(' saw exception ArchiveLoadFailed: '+str(e).rstrip()) + print(' skipping rest of file') + + check_global(self.all_records, self.concurrent_to) + + return self.exit_value + + def process_one(self, warcfile): + _process_one(warcfile, self.all_records, self.concurrent_to, self.verbose) diff --git a/warcio/utils.py b/warcio/utils.py index 08783f06..fb544cff 100644 --- a/warcio/utils.py +++ b/warcio/utils.py @@ -13,14 +13,14 @@ # #=========================================================================== -def to_native_str(value, encoding='utf-8'): +def to_native_str(value, encoding='utf-8', errors='strict'): if isinstance(value, str): return value if six.PY3 and isinstance(value, six.binary_type): #pragma: no cover - return value.decode(encoding) + return value.decode(encoding, errors) elif six.PY2 and isinstance(value, six.text_type): #pragma: no cover - return value.encode(encoding) + return value.encode(encoding, errors) else: return value