From 8d39bd1b0590be804c37f1cdf0623a581574aeaa Mon Sep 17 00:00:00 2001 From: heinrich5991 Date: Tue, 11 Feb 2025 18:51:17 +0100 Subject: [PATCH 1/2] Remove `six` dependency Python 2.7 compatibility was dropped in #116, so `six` is not needed anymore. --- README.rst | 3 +-- setup.py | 4 +--- test/test_bufferedreaders.py | 4 +--- test/test_statusandheaders.py | 2 +- test/test_utils.py | 6 ++--- warcio/archiveiterator.py | 5 ++--- warcio/capture_http.py | 4 ++-- warcio/recordbuilder.py | 3 +-- warcio/recordloader.py | 2 -- warcio/statusandheaders.py | 4 +--- warcio/utils.py | 41 +++++------------------------------ 11 files changed, 18 insertions(+), 60 deletions(-) diff --git a/README.rst b/README.rst index 240ab9a2..4153fc34 100644 --- a/README.rst +++ b/README.rst @@ -11,8 +11,7 @@ Background This library provides a fast, standalone way to read and write `WARC Format `__ commonly used in -web archives. Python 3.7+ (minimally only needing -`six `__ as an external dependency) +web archives. Python 3.7+. warcio supports reading and writing of WARC files compliant with both the `WARC 1.0 `__ and `WARC 1.1 `__ ISO standards. 
diff --git a/setup.py b/setup.py index 2a5fe057..b1368315 100755 --- a/setup.py +++ b/setup.py @@ -19,9 +19,7 @@ provides=[ 'warcio', ], - install_requires=[ - 'six', - ], + install_requires=[], zip_safe=True, entry_points=""" [console_scripts] diff --git a/test/test_bufferedreaders.py b/test/test_bufferedreaders.py index dd49ac6a..47dc3b3f 100644 --- a/test/test_bufferedreaders.py +++ b/test/test_bufferedreaders.py @@ -101,8 +101,6 @@ from contextlib import closing -import six - import zlib import pytest @@ -177,7 +175,7 @@ def test_err_chunk_cut_off(): def print_str(string): - return string.decode('utf-8') if six.PY3 else string + return string.decode('utf-8') diff --git a/test/test_statusandheaders.py b/test/test_statusandheaders.py index e2ff4fc3..1f1df2a3 100644 --- a/test/test_statusandheaders.py +++ b/test/test_statusandheaders.py @@ -91,7 +91,7 @@ from warcio.statusandheaders import StatusAndHeadersParser, StatusAndHeaders -from six import StringIO +from io import StringIO import pytest diff --git a/test/test_utils.py b/test/test_utils.py index b235e00e..0eee7099 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -77,14 +77,14 @@ def test_to_native_str(self): def test_open_exclusive(self): temp_dir = tempfile.mkdtemp('warctest') full_name = os.path.join(temp_dir, 'foo.txt') - with utils.open(full_name, 'xb') as fh: + with open(full_name, 'xb') as fh: fh.write(b'test\r\nfoo') with pytest.raises(OSError): - with utils.open(full_name, 'xb') as fh: + with open(full_name, 'xb') as fh: fh.write(b'test\r\nfoo') - with utils.open(full_name, 'rb') as fh: + with open(full_name, 'rb') as fh: assert fh.read() == b'test\r\nfoo' os.remove(full_name) diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 31a69421..7a4b06d1 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -6,7 +6,6 @@ from warcio.utils import BUFF_SIZE import sys -import six # ============================================================================ 
class UnseekableYetTellable: @@ -23,7 +22,7 @@ def read(self, size=-1): return result # ============================================================================ -class ArchiveIterator(six.Iterator): +class ArchiveIterator: """ Iterate over records in WARC and ARC files, both gzip chunk compressed and uncompressed @@ -91,7 +90,7 @@ def __iter__(self): return self.the_iter def __next__(self): - return six.next(self.the_iter) + return next(self.the_iter) def close(self): self.record = None diff --git a/warcio/capture_http.py b/warcio/capture_http.py index d83cb277..5f2be306 100644 --- a/warcio/capture_http.py +++ b/warcio/capture_http.py @@ -2,13 +2,13 @@ from io import BytesIO -from six.moves import http_client as httplib +import http.client as httplib from contextlib import contextmanager from array import array -from warcio.utils import to_native_str, BUFF_SIZE, open +from warcio.utils import to_native_str, BUFF_SIZE from warcio.warcwriter import WARCWriter, BufferWARCWriter from tempfile import SpooledTemporaryFile diff --git a/warcio/recordbuilder.py b/warcio/recordbuilder.py index 081b5b11..a215d6bc 100644 --- a/warcio/recordbuilder.py +++ b/warcio/recordbuilder.py @@ -1,4 +1,3 @@ -import six import tempfile from datetime import datetime, timezone @@ -44,7 +43,7 @@ def create_warcinfo_record(self, filename, info): warc_headers.add_header('WARC-Date', self.curr_warc_date()) warcinfo = BytesIO() - for name, value in six.iteritems(info): + for name, value in info.items(): if not value: continue diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 475507ba..c1256564 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -10,8 +10,6 @@ from warcio.timeutils import timestamp_to_iso_date -from six.moves import zip - import logging logger = logging.getLogger(__name__) diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py index ce42d662..88d8d353 100644 --- a/warcio/statusandheaders.py +++ b/warcio/statusandheaders.py @@ 
-2,12 +2,10 @@ Representation and parsing of HTTP-style status + headers """ -from six.moves import range -from six import iteritems from warcio.utils import to_native_str, headers_to_str_headers import uuid -from six.moves.urllib.parse import quote +from urllib.parse import quote import re diff --git a/warcio/utils.py b/warcio/utils.py index 08783f06..0b1c3239 100644 --- a/warcio/utils.py +++ b/warcio/utils.py @@ -1,13 +1,9 @@ -import six import os from contextlib import contextmanager import base64 import hashlib -try: - import collections.abc as collections_abc # only works on python 3.3+ -except ImportError: #pragma: no cover - import collections as collections_abc +import collections.abc BUFF_SIZE = 16384 @@ -17,10 +13,8 @@ def to_native_str(value, encoding='utf-8'): if isinstance(value, str): return value - if six.PY3 and isinstance(value, six.binary_type): #pragma: no cover + if isinstance(value, bytes): return value.decode(encoding) - elif six.PY2 and isinstance(value, six.text_type): #pragma: no cover - return value.encode(encoding) else: return value @@ -48,19 +42,16 @@ def headers_to_str_headers(headers): ''' ret = [] - if isinstance(headers, collections_abc.Mapping): + if isinstance(headers, collections.abc.Mapping): h = headers.items() else: h = headers - if six.PY2: #pragma: no cover - return h - for tup in h: k, v = tup - if isinstance(k, six.binary_type): + if isinstance(k, bytes): k = k.decode('iso-8859-1') - if isinstance(v, six.binary_type): + if isinstance(v, bytes): v = v.decode('iso-8859-1') ret.append((k, v)) return ret @@ -77,25 +68,3 @@ def update(self, buff): def __str__(self): return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest())) - - -#============================================================================= -sys_open = open - -def open(filename, mode='r', **kwargs): #pragma: no cover - """ - open() which supports exclusive mode 'x' in python < 3.3 - """ - if six.PY3 or 'x' not in mode: - return 
sys_open(filename, mode, **kwargs) - - flags = os.O_EXCL | os.O_CREAT | os.O_WRONLY - if 'b' in mode and hasattr(os, 'O_BINARY'): - flags |= os.O_BINARY - - fd = os.open(filename, flags) - mode = mode.replace('x', 'w') - return os.fdopen(fd, mode, 0x664) - - - From 972d2082db5ef5463f2e10e7b8bb94483785fef6 Mon Sep 17 00:00:00 2001 From: heinrich5991 Date: Tue, 11 Feb 2025 18:51:59 +0100 Subject: [PATCH 2/2] Don't derive from `object` Python 3 doesn't need that anymore. --- README.rst | 2 +- test/test_archiveiterator.py | 2 +- test/test_capture_http.py | 2 +- test/test_check_digest_examples.py | 2 +- test/test_limitreader.py | 2 +- test/test_utils.py | 2 +- test/test_writer.py | 2 +- warcio/bufferedreaders.py | 2 +- warcio/capture_http.py | 4 ++-- warcio/checker.py | 2 +- warcio/digestverifyingreader.py | 2 +- warcio/extractor.py | 2 +- warcio/indexer.py | 2 +- warcio/limitreader.py | 2 +- warcio/recompressor.py | 2 +- warcio/recordbuilder.py | 2 +- warcio/recordloader.py | 6 +++--- warcio/statusandheaders.py | 4 ++-- warcio/utils.py | 2 +- warcio/warcwriter.py | 2 +- 20 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.rst b/README.rst index 4153fc34..09cc7de7 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,7 @@ the format (ARC or WARC), record type, the record headers, http headers .. 
code:: python - class ArcWarcRecord(object): + class ArcWarcRecord: def __init__(self, *args): (self.format, self.rec_type, self.rec_headers, self.raw_stream, self.http_headers, self.content_type, self.length) = args diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 10914ce5..4dd0fe28 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -16,7 +16,7 @@ #============================================================================== -class TestArchiveIterator(object): +class TestArchiveIterator: def _load_archive(self, filename, offset=0, cls=ArchiveIterator, errs_expected=0, **kwargs): diff --git a/test/test_capture_http.py b/test/test_capture_http.py index 6348cb5d..c7d5892a 100644 --- a/test/test_capture_http.py +++ b/test/test_capture_http.py @@ -21,7 +21,7 @@ # ================================================================== -class TestCaptureHttpBin(object): +class TestCaptureHttpBin: @classmethod def setup_class(cls): from httpbin import app as httpbin_app diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py index 679d7d24..46e30424 100644 --- a/test/test_check_digest_examples.py +++ b/test/test_check_digest_examples.py @@ -21,7 +21,7 @@ def pytest_generate_tests(metafunc): metafunc.parametrize('test_filename', files) -class TestExamplesDigest(object): +class TestExamplesDigest: def check_helper(self, args, expected_exit_value, capsys): exit_value = None try: diff --git a/test/test_limitreader.py b/test/test_limitreader.py index bf2b5c63..eabb9af2 100644 --- a/test/test_limitreader.py +++ b/test/test_limitreader.py @@ -3,7 +3,7 @@ from io import BytesIO -class TestLimitReader(object): +class TestLimitReader: def test_limit_reader_1(self): assert b'abcdefghji' == LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 10).read(26) diff --git a/test/test_utils.py b/test/test_utils.py index 0eee7099..17e8a2ea 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -14,7 +14,7 
@@ pass -class TestUtils(object): +class TestUtils: def test_headers_to_str_headers(self): result = [('foo', 'bar'), ('baz', 'barf')] diff --git a/test/test_writer.py b/test/test_writer.py index 597c24f9..441dc624 100644 --- a/test/test_writer.py +++ b/test/test_writer.py @@ -560,7 +560,7 @@ def record_sampler(request): # ============================================================================ -class TestWarcWriter(object): +class TestWarcWriter: @classmethod def _validate_record_content_len(cls, stream): for record in ArchiveIterator(stream, no_record_parse=True): diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 80a46126..cb8b8948 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -37,7 +37,7 @@ def brotli_decompressor(): #================================================================= -class BufferedReader(object): +class BufferedReader: """ A wrapping line reader which wraps an existing reader. Read operations operate on underlying buffer, which is filled to diff --git a/warcio/capture_http.py b/warcio/capture_http.py index 5f2be306..0ecde7a5 100644 --- a/warcio/capture_http.py +++ b/warcio/capture_http.py @@ -19,7 +19,7 @@ # ============================================================================ -class RecordingStream(object): +class RecordingStream: def __init__(self, fp, recorder): self.fp = fp self.recorder = recorder @@ -130,7 +130,7 @@ def putrequest(self, *args, **kwargs): # ============================================================================ -class RequestRecorder(object): +class RequestRecorder: def __init__(self, writer, filter_func=None, record_ip=True): self.writer = writer self.filter_func = filter_func diff --git a/warcio/checker.py b/warcio/checker.py index 56715c86..5c7a328b 100644 --- a/warcio/checker.py +++ b/warcio/checker.py @@ -11,7 +11,7 @@ def _read_entire_stream(stream): break -class Checker(object): +class Checker: def __init__(self, cmd): self.inputs = cmd.inputs 
self.verbose = cmd.verbose diff --git a/warcio/digestverifyingreader.py b/warcio/digestverifyingreader.py index 985a4ed2..eba2112f 100644 --- a/warcio/digestverifyingreader.py +++ b/warcio/digestverifyingreader.py @@ -7,7 +7,7 @@ # ============================================================================ -class DigestChecker(object): +class DigestChecker: def __init__(self, kind=None): self._problem = [] self._passed = None diff --git a/warcio/extractor.py b/warcio/extractor.py index 58ba4f4e..635474fe 100644 --- a/warcio/extractor.py +++ b/warcio/extractor.py @@ -5,7 +5,7 @@ # ============================================================================ -class Extractor(object): +class Extractor: READ_SIZE = BUFF_SIZE * 4 def __init__(self, filename, offset): diff --git a/warcio/indexer.py b/warcio/indexer.py index 3e9927b5..f0d59fb1 100644 --- a/warcio/indexer.py +++ b/warcio/indexer.py @@ -9,7 +9,7 @@ # ============================================================================ -class Indexer(object): +class Indexer: field_names = {} def __init__(self, fields, inputs, output, verify_http=False): diff --git a/warcio/limitreader.py b/warcio/limitreader.py index bfa6d9e5..4cf9021a 100644 --- a/warcio/limitreader.py +++ b/warcio/limitreader.py @@ -1,5 +1,5 @@ # ============================================================================ -class LimitReader(object): +class LimitReader: """ A reader which will not read more than specified limit """ diff --git a/warcio/recompressor.py b/warcio/recompressor.py index 630db4b4..41b509d0 100644 --- a/warcio/recompressor.py +++ b/warcio/recompressor.py @@ -11,7 +11,7 @@ # ============================================================================ -class Recompressor(object): +class Recompressor: def __init__(self, filename, output, verbose=False): self.filename = filename self.output = output diff --git a/warcio/recordbuilder.py b/warcio/recordbuilder.py index a215d6bc..df2cd17c 100644 --- a/warcio/recordbuilder.py +++ 
b/warcio/recordbuilder.py @@ -9,7 +9,7 @@ from warcio.utils import to_native_str, BUFF_SIZE, Digester #================================================================= -class RecordBuilder(object): +class RecordBuilder: REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest' REVISIT_PROFILE_1_1 = 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest' diff --git a/warcio/recordloader.py b/warcio/recordloader.py index c1256564..dad4d18d 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -15,7 +15,7 @@ #================================================================= -class ArcWarcRecord(object): +class ArcWarcRecord: def __init__(self, *args, **kwargs): (self.format, self.rec_type, self.rec_headers, self.raw_stream, self.http_headers, self.content_type, self.length) = args @@ -43,7 +43,7 @@ def content_stream(self): #================================================================= -class ArcWarcRecordLoader(object): +class ArcWarcRecordLoader: WARC_TYPES = ['WARC/1.1', 'WARC/1.0', 'WARC/0.17', 'WARC/0.18'] HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1'] @@ -267,7 +267,7 @@ def _ensure_target_uri_format(self, rec_headers): #================================================================= -class ARCHeadersParser(object): +class ARCHeadersParser: # ARC 1.0 headers ARC_HEADERS = ["uri", "ip-address", "archive-date", "content-type", "length"] diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py index 88d8d353..b629a817 100644 --- a/warcio/statusandheaders.py +++ b/warcio/statusandheaders.py @@ -10,7 +10,7 @@ #================================================================= -class StatusAndHeaders(object): +class StatusAndHeaders: ENCODE_HEADER_RX = re.compile(r'[=]["\']?([^;"]+)["\']?(?=[;]?)') """ Representation of parsed http-style status line and headers @@ -222,7 +222,7 @@ def _strip_count(string, total_read): #================================================================= -class 
StatusAndHeadersParser(object): +class StatusAndHeadersParser: """ Parser which consumes a stream support readline() to read status and headers and return a StatusAndHeaders object diff --git a/warcio/utils.py b/warcio/utils.py index 0b1c3239..94478184 100644 --- a/warcio/utils.py +++ b/warcio/utils.py @@ -58,7 +58,7 @@ def headers_to_str_headers(headers): # ============================================================================ -class Digester(object): +class Digester: def __init__(self, type_='sha1'): self.type_ = type_ self.digester = hashlib.new(type_) diff --git a/warcio/warcwriter.py b/warcio/warcwriter.py index 6fb71be9..b8c7d085 100644 --- a/warcio/warcwriter.py +++ b/warcio/warcwriter.py @@ -110,7 +110,7 @@ def _write_warc_record(self, out, record): # ============================================================================ -class GzippingWrapper(object): +class GzippingWrapper: def __init__(self, out): self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16) self.out = out