Skip to content

Commit 942504c

Browse files
authored
Improve docs for reading from a file (#95)
* Improve docs for reading from a file * add deps badge * update comment * Use read instead of mmap in benchmark
1 parent 8647ce9 commit 942504c

File tree

4 files changed

+158
-15
lines changed

4 files changed

+158
-15
lines changed

README.md

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
[![CI](https://github.com/JuliaIO/ZipArchives.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/JuliaIO/ZipArchives.jl/actions/workflows/CI.yml)
55
[![codecov](https://codecov.io/gh/JuliaIO/ZipArchives.jl/branch/main/graph/badge.svg?token=K3J0T9BZ42)](https://codecov.io/gh/JuliaIO/ZipArchives.jl)
66
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
7+
[![deps](https://juliahub.com/docs/General/ZipArchives/stable/deps.svg)](https://juliahub.com/ui/Packages/General/ZipArchives?t=2)
78

89
Read and write Zip archives in Julia.
910

@@ -93,20 +94,10 @@ end
9394
```
9495

9596
### Streaming one entry in a large archive file
96-
If your archive is in a file, `mmap` can be used to treat the file as a `Vector{UInt8}`.
97+
If your archive is in a larger-than-memory file, `Mmap.jl`
98+
or a custom struct can be used to treat the file as an `AbstractVector{UInt8}`.
9799

98-
An entry in the archive can be opened as an `IO` stream using `zip_openentry`.
99-
100-
```julia
101-
using ZipArchives: ZipReader, zip_openentry
102-
using Downloads: download
103-
using Mmap: mmap
104-
zip_file_path = download("https://github.com/JuliaIO/ZipArchives.jl/archive/refs/heads/main.zip");
105-
archive = ZipReader(mmap(open(zip_file_path)))
106-
readme_n_lines = zip_openentry(archive, "ZipArchives.jl-main/README.md") do io
107-
countlines(io)
108-
end
109-
```
100+
See [test/test_file-array.jl](https://github.com/JuliaIO/ZipArchives.jl/blob/main/test/test_file-array.jl) for an example.
110101

111102
### Supported Compression Methods
112103

@@ -120,7 +111,7 @@ end
120111

121112
1. Cannot directly extract all files in an archive and write those files to disk.
122113
1. Ignores time stamps.
123-
1. Cannot write an archive fully in streaming mode. See https://github.com/madler/zipflow if you need this functionality.
114+
1. Cannot write an archive fully in streaming mode. See https://github.com/madler/zipflow or https://github.com/reallyasi9/ZipStreams.jl if you need this functionality.
124115
1. Encryption and decryption are not supported.
125116
1. Multi-disk archives are not supported.
126117
1. Cannot recover data from a corrupted archive, especially if the end of the archive is corrupted.
@@ -149,9 +140,16 @@ Currently, ZipArchives has the following benefits over ZipFile:
149140
ZipArchives currently has the following limitations compared to ZipFile:
150141
1. No way to specify the modification time, times are set to 1980-01-01 00:00:00 DOS date time.
151142
2. No `flush` function for `ZipWriter`. `close` and `zip_append_archive` can be used instead.
152-
3. No way to read an archive from an `IOStream`, `mmap` can be used instead.
143+
3. No way to read an archive from an `IOStream`; instead, `Mmap.jl` or a custom struct can be used to treat the file as an `AbstractVector{UInt8}`. Example in [test/test_file-array.jl](https://github.com/JuliaIO/ZipArchives.jl/blob/main/test/test_file-array.jl).
144+
145+
### [ZipStreams](https://github.com/reallyasi9/ZipStreams.jl)
146+
147+
Unlike ZipArchives, ZipStreams is able to read from non-seekable streams, but may fail to correctly
148+
read some pathological archives.
153149

150+
### [LibZip](https://github.com/bhftbootcamp/LibZip.jl)
154151

152+
LibZip is a wrapper of the [libzip C library](https://github.com/nih-at/libzip) which supports encryption and decryption.
155153

156154

157155
## Is there an unzip function for a whole archive?

benchmark/benchmarks.jl

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,68 @@ ZipWriter(sink) do w
2929
end
3030
data = take!(sink)
3131
rbench["empty ZipReader"] = @benchmarkable ZipReader($(data))
32+
33+
# Reading from a file
34+
rfbench = rbench["file"] = BenchmarkGroup()
35+
36+
fname = tempname()
37+
ZipWriter(fname) do w
38+
zip_newfile(w, "test/compressed.txt"; compress=true)
39+
write(w, "Compressed \n Data \n"^1000)
40+
zip_newfile(w, "test/uncompressed.txt")
41+
write(w, "Uncompressed Data \n"^1000)
42+
end
43+
44+
# Based on DiskArrays code suggested by Fabian Gans in
45+
# https://discourse.julialang.org/t/struggling-to-use-mmap-with-ziparchives/129839/19
46+
struct FileArray <: AbstractVector{UInt8}
47+
filename::String
48+
offset::Int64
49+
len::Int64
50+
end
51+
function FileArray(filename::String, offset::Int64=Int64(0))
52+
len = filesize(filename)
53+
len ≥ 0 || error("filesize of $(repr(filename)) is negative")
54+
offset ≥ 0 || error("offset $(offset) is negative")
55+
offset ≤ len || error("offset $(offset) is larger than the filesize $(len)")
56+
FileArray(filename, offset, len-offset)
57+
end
58+
Base.size(s::FileArray) = (s.len,)
59+
function Base.getindex(s::FileArray, i::Int)::UInt8
60+
copyto!(zeros(UInt8, 1), Int64(1), s, Int64(i), Int64(1))[1]
61+
end
62+
function Base.view(s::FileArray, inds::UnitRange{Int64})::FileArray
63+
checkbounds(s, inds)
64+
FileArray(s.filename, s.offset + first(inds) - Int64(1), length(inds))
65+
end
66+
dest_types = if VERSION ≥ v"1.11"
67+
(Vector{UInt8}, Memory{UInt8},)
68+
else
69+
(Vector{UInt8},)
70+
end
71+
for dest_type in dest_types
72+
@eval begin
73+
function Base.copyto!(dest::$dest_type, dstart::Int64, src::FileArray, sstart::Int64, n::Int64)
74+
iszero(n) && return dest
75+
n ≥ 0 || throw(ArgumentError("tried to copy n=$(n) elements, but n should be non-negative"))
76+
checkbounds(dest, dstart)
77+
checkbounds(src, sstart)
78+
checkbounds(dest, dstart + n - Int64(1))
79+
checkbounds(src, sstart + n - Int64(1))
80+
open(src.filename) do io
81+
seek(io, src.offset + sstart - Int64(1))
82+
nb = readbytes!(io, view(dest, range(dstart; length=n)))
83+
nb == n || error("short read")
84+
end
85+
return dest
86+
end
87+
end
88+
end
89+
90+
r = ZipReader(FileArray(fname))
91+
rfbench["ZipReader FileArray"] = @benchmarkable ZipReader(FileArray($(fname)))
92+
rfbench["zip_test FileArray"] = @benchmarkable zip_test($(r))
93+
94+
r = ZipReader(read(fname))
95+
rfbench["ZipReader read"] = @benchmarkable ZipReader(read($(fname)))
96+
rfbench["zip_test read"] = @benchmarkable zip_test($(r))

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Random.seed!(1234)
1111
# @test Any[] == detect_ambiguities(Base, Core, ZipArchives)
1212
include("test_bytes2string.jl")
1313
include("test_simple-usage.jl")
14+
include("test_file-array.jl")
1415
include("test_filename-checks.jl")
1516
include("test_show.jl")
1617
include("test_writer.jl")

test/test_file-array.jl

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
using ZipArchives:
2+
ZipWriter,
3+
zip_newfile,
4+
ZipReader,
5+
zip_openentry,
6+
zip_test
7+
8+
using Test: @testset, @test
9+
10+
@testset "Streaming one entry in a large archive file" begin
11+
# create test file
12+
fname = tempname()
13+
ZipWriter(fname) do w
14+
zip_newfile(w, "test/compressed.txt"; compress=true)
15+
write(w, "Compressed \n Data \n"^1000)
16+
zip_newfile(w, "test/uncompressed.txt")
17+
write(w, "Uncompressed Data \n"^1000)
18+
end
19+
20+
# Based on DiskArrays code suggested by Fabian Gans in
21+
# https://discourse.julialang.org/t/struggling-to-use-mmap-with-ziparchives/129839/19
22+
struct FileArray <: AbstractVector{UInt8}
23+
filename::String
24+
offset::Int64
25+
len::Int64
26+
end
27+
function FileArray(filename::String, offset::Int64=Int64(0))
28+
len = filesize(filename)
29+
len ≥ 0 || error("filesize of $(repr(filename)) is negative")
30+
offset ≥ 0 || error("offset $(offset) is negative")
31+
offset ≤ len || error("offset $(offset) is larger than the filesize $(len)")
32+
FileArray(filename, offset, len-offset)
33+
end
34+
Base.size(s::FileArray) = (s.len,)
35+
function Base.getindex(s::FileArray, i::Int)::UInt8
36+
copyto!(zeros(UInt8, 1), Int64(1), s, Int64(i), Int64(1))[1]
37+
end
38+
function Base.view(s::FileArray, inds::UnitRange{Int64})::FileArray
39+
checkbounds(s, inds)
40+
FileArray(s.filename, s.offset + first(inds) - Int64(1), length(inds))
41+
end
42+
dest_types = if VERSION ≥ v"1.11"
43+
(Vector{UInt8}, Memory{UInt8},)
44+
else
45+
(Vector{UInt8},)
46+
end
47+
for dest_type in dest_types
48+
@eval begin
49+
function Base.copyto!(dest::$dest_type, dstart::Int64, src::FileArray, sstart::Int64, n::Int64)
50+
iszero(n) && return dest
51+
n ≥ 0 || throw(ArgumentError("tried to copy n=$(n) elements, but n should be non-negative"))
52+
checkbounds(dest, dstart)
53+
checkbounds(src, sstart)
54+
checkbounds(dest, dstart + n - Int64(1))
55+
checkbounds(src, sstart + n - Int64(1))
56+
open(src.filename) do io
57+
seek(io, src.offset + sstart - Int64(1))
58+
nb = readbytes!(io, view(dest, range(dstart; length=n)))
59+
nb == n || error("short read")
60+
end
61+
return dest
62+
end
63+
end
64+
end
65+
66+
archive = ZipReader(FileArray(fname))
67+
68+
# validate all the checksums
69+
zip_test(archive)
70+
71+
# `zip_openentry` is used to stream data from the entry as an `IO`.
72+
# This works with both compressed and uncompressed entries.
73+
zip_openentry(archive, "test/compressed.txt") do io
74+
@test countlines(io) == 2000
75+
end
76+
zip_openentry(archive, "test/uncompressed.txt") do io
77+
@test countlines(io) == 1000
78+
end
79+
end

0 commit comments

Comments
 (0)