From 896d63cd3d950bc1d4f26eaa92dc73b120d52e74 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 20 Oct 2025 19:21:03 +0530 Subject: [PATCH 01/15] Added Huggingface support --- Project.toml | 14 ++++++ src/HealthSampleData.jl | 6 +++ src/huggingface.jl | 100 ++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 20 ++++++++ 4 files changed, 140 insertions(+) create mode 100644 src/huggingface.jl create mode 100644 test/runtests.jl diff --git a/Project.toml b/Project.toml index 9df3048..11d15d4 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,21 @@ version = "0.0.1" [deps] DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" +HuggingFaceHub = "d0076355-e2c0-48e6-a044-05906e51b7fc" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] DataDeps = "0.7" +Downloads = "1.6.0" +FilePathsBase = "0.9.24" +HuggingFaceHub = "0.1.2" +Logging = "1.11.0" +Pkg = "1.11.0" +Random = "1.11.0" +Test = "1.11.0" julia = "1" diff --git a/src/HealthSampleData.jl b/src/HealthSampleData.jl index f59108a..51756cf 100644 --- a/src/HealthSampleData.jl +++ b/src/HealthSampleData.jl @@ -1,7 +1,13 @@ module HealthSampleData using DataDeps + using HuggingFaceHub + using Logging + using FilePathsBase: isfile, joinpath + using Downloads + using Random + include("huggingface.jl") include("OMOP_Common_Data_Model/data.jl") end diff --git a/src/huggingface.jl b/src/huggingface.jl new file mode 100644 index 0000000..e306de8 --- /dev/null +++ b/src/huggingface.jl @@ -0,0 +1,100 @@ +const HF = HuggingFaceHub + +""" + _huggingface_dataset_register(name::String, repo::String, filename::String) + +Resolve dataset metadata from Hugging Face, download `filename` via HuggingFaceHub, +register a DataDep pointing at the HF URL, and return the local filesystem path +to the downloaded file. Displays download progress when possible. +""" +function _huggingface_dataset_register(name::String, repo::String, filename::String) + @info "Resolving Huggingface Metadata for $repo" + dataset = HF.info(HF.Dataset, repo) + + last_pct = Ref(-1) + progress_fn = function(downloaded::Integer, total::Integer) + if total > 0 + safe_downloaded = min(downloaded, total) + pct = clamp(Int(floor(100 * safe_downloaded / total)), 0, 100) + if pct != last_pct[] + last_pct[] = pct + downloaded_mb = round(safe_downloaded / 1024^2; digits=1) + total_mb = round(total / 1024^2; digits=1) + @info "Download progress: $pct% ($downloaded_mb MB / $total_mb MB)" + end + else + downloaded_mb = round(downloaded / 1024^2; digits=1) + @info "Downloaded $downloaded_mb MB" + end + return nothing + end + + @info "Downloading $filename from $repo via HuggingFaceHub..." + try + localpath = nothing + try + localpath = HF.file_download(dataset, filename; progress = progress_fn) + catch inner + if inner isa MethodError + @warn "HuggingFaceHub.file_download does not support progress callback on this version; falling back to no-progress call." + localpath = HF.file_download(dataset, filename) + else + rethrow(inner) + end + end + @info "Downloaded to $localpath" + catch e + msg = string(e) + if occursin("symlink", msg) || occursin("creating symlinks", msg) || occursin("Administrator", msg) || occursin("operation not permitted", msg) + @warn "Symlink creation failed (likely Windows privilege). Falling back to direct HTTP download: $e" + url = "https://huggingface.co/datasets/$(repo)/resolve/main/$(filename)" + tmpdir = mktempdir() + dest = joinpath(tmpdir, filename) + @info "Downloading $url -> $dest (no symlink)" + try + Downloads.download(url, dest; progress = progress_fn) + localpath = dest + @info "Fallback download complete: $localpath" + catch e2 + rethrow(e2) + end + else + rethrow(e) + end + end + + url = "https://huggingface.co/datasets/$(repo)/resolve/main/$(filename)" + dep = DataDep( + name, + "Dataset from Hugging Face repository $(repo).", + url; + post_fetch_method = p -> isfile(p) ? p : joinpath(p, filename) + ) + + try + register(dep) + catch e + @warn "DataDep registration failed or already registered: $(e)" + end + + return localpath +end + +""" + load(name::String) + +Simple dispatcher that maps known dataset names to repo + filename, +ensures download/registration, and returns local path. +""" +function load(name::String) + if name == "synthea_1M_3YR" + repo = "JuliaHealthOrg/JuliaHealthDatasets" + filename = "synthea_1M_3YR.duckdb" + else + throw(ArgumentError("Unknown dataset: $name")) + end + + return _huggingface_dataset_register(name, repo, filename) +end + +export load, _huggingface_dataset_register \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..846efe1 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,20 @@ +using Test +using Pkg +using HealthSampleData + +@testset "huggingface helper tests" begin + @test_throws ArgumentError HealthSampleData.load("nonexistent_dataset") + + if get(ENV, "HF_INTEGRATION", "0") == "1" + @info "Running HF integration test (will download from Hugging Face)" + path = nothing + try + path = HealthSampleData.load("synthea_1M_3YR") + @test ispath(path) + finally + @info "HF integration test result path: $path" + end + else + @info "Skipping HF integration test (set HF_INTEGRATION=1 to enable)" + end +end \ No newline at end of file From 36cdc8468093d8808d736b0a2bca6aa7de6825fc Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Sat, 1 Nov 2025 09:04:18 +0530 Subject: [PATCH 02/15] Updates as per review --- Project.toml | 3 -- src/HealthSampleData.jl | 7 ++- src/HuggingFaceDatasets/data.jl | 21 +++++++++ src/huggingface.jl | 77 ++++----------------------------- src/utilities.jl | 31 +++++++++++++ test/runtests.jl | 13 +++++- 6 files changed, 75 insertions(+), 77 deletions(-) create mode 100644 src/HuggingFaceDatasets/data.jl create mode 100644 src/utilities.jl diff --git a/Project.toml b/Project.toml index edc6328..7c4cb70 100644 --- a/Project.toml +++ b/Project.toml @@ -5,12 +5,9 @@ version = "0.0.1" [deps] DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" HuggingFaceHub = "d0076355-e2c0-48e6-a044-05906e51b7fc" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] diff --git a/src/HealthSampleData.jl b/src/HealthSampleData.jl index 51756cf..c333a66 100644 --- a/src/HealthSampleData.jl +++ b/src/HealthSampleData.jl @@ -3,11 +3,10 @@ module HealthSampleData using DataDeps using HuggingFaceHub using Logging - using FilePathsBase: isfile, joinpath - using Downloads - using Random + - include("huggingface.jl") + include("utilities.jl") include("OMOP_Common_Data_Model/data.jl") + include("HuggingFaceDatasets/data.jl") end diff --git a/src/HuggingFaceDatasets/data.jl b/src/HuggingFaceDatasets/data.jl new file mode 100644 index 0000000..0a44e46 --- /dev/null +++ b/src/HuggingFaceDatasets/data.jl @@ -0,0 +1,21 @@ +using HealthSampleData + +""" + register_huggingface_dataset(name::String, repo::String, filename::String) + +Registers a dataset from HuggingFace as a DataDep and returns the local path. +""" +function register_huggingface_dataset(name::String, repo::String, filename::String) + localpath = HealthSampleData._huggingface_dataset_register(name, repo, filename) + + register(DataDep( + name, + "Dataset from Hugging Face repository $(repo).", + "https://huggingface.co/datasets/$(repo)/resolve/main/$(filename)"; + fetch_method = p -> localpath + )) + + return localpath +end + +export register_huggingface_dataset \ No newline at end of file diff --git a/src/huggingface.jl b/src/huggingface.jl index e306de8..3ce7882 100644 --- a/src/huggingface.jl +++ b/src/huggingface.jl @@ -1,47 +1,23 @@ +using HealthSampleData + const HF = HuggingFaceHub """ _huggingface_dataset_register(name::String, repo::String, filename::String) Resolve dataset metadata from Hugging Face, download `filename` via HuggingFaceHub, -register a DataDep pointing at the HF URL, and return the local filesystem path -to the downloaded file. Displays download progress when possible. +and return the local filesystem path to the downloaded file. Displays download progress when possible. """ function _huggingface_dataset_register(name::String, repo::String, filename::String) @info "Resolving Huggingface Metadata for $repo" dataset = HF.info(HF.Dataset, repo) last_pct = Ref(-1) - progress_fn = function(downloaded::Integer, total::Integer) - if total > 0 - safe_downloaded = min(downloaded, total) - pct = clamp(Int(floor(100 * safe_downloaded / total)), 0, 100) - if pct != last_pct[] - last_pct[] = pct - downloaded_mb = round(safe_downloaded / 1024^2; digits=1) - total_mb = round(total / 1024^2; digits=1) - @info "Download progress: $pct% ($downloaded_mb MB / $total_mb MB)" - end - else - downloaded_mb = round(downloaded / 1024^2; digits=1) - @info "Downloaded $downloaded_mb MB" - end - return nothing - end + progress_fn = (downloaded, total) -> HealthSampleData.utilities.progress_callback(downloaded, total, last_pct) @info "Downloading $filename from $repo via HuggingFaceHub..." try - localpath = nothing - try - localpath = HF.file_download(dataset, filename; progress = progress_fn) - catch inner - if inner isa MethodError - @warn "HuggingFaceHub.file_download does not support progress callback on this version; falling back to no-progress call." - localpath = HF.file_download(dataset, filename) - else - rethrow(inner) - end - end + localpath = HF.file_download(dataset, filename; progress = progress_fn) @info "Downloaded to $localpath" catch e msg = string(e) @@ -51,50 +27,15 @@ function _huggingface_dataset_register(name::String, repo::String, filename::Str tmpdir = mktempdir() dest = joinpath(tmpdir, filename) @info "Downloading $url -> $dest (no symlink)" - try - Downloads.download(url, dest; progress = progress_fn) - localpath = dest - @info "Fallback download complete: $localpath" - catch e2 - rethrow(e2) - end + Downloads.download(url, dest; progress = progress_fn) + localpath = dest + @info "Fallback download complete: $localpath" else rethrow(e) end end - url = "https://huggingface.co/datasets/$(repo)/resolve/main/$(filename)" - dep = DataDep( - name, - "Dataset from Hugging Face repository $(repo).", - url; - post_fetch_method = p -> isfile(p) ? p : joinpath(p, filename) - ) - - try - register(dep) - catch e - @warn "DataDep registration failed or already registered: $(e)" - end - return localpath end -""" - load(name::String) - -Simple dispatcher that maps known dataset names to repo + filename, -ensures download/registration, and returns local path. -""" -function load(name::String) - if name == "synthea_1M_3YR" - repo = "JuliaHealthOrg/JuliaHealthDatasets" - filename = "synthea_1M_3YR.duckdb" - else - throw(ArgumentError("Unknown dataset: $name")) - end - - return _huggingface_dataset_register(name, repo, filename) -end - -export load, _huggingface_dataset_register \ No newline at end of file +export _huggingface_dataset_register \ No newline at end of file diff --git a/src/utilities.jl b/src/utilities.jl new file mode 100644 index 0000000..f6f980d --- /dev/null +++ b/src/utilities.jl @@ -0,0 +1,31 @@ +""" + progress_callback(downloaded::Integer, total::Integer, last_pct::Ref{Int}) + +A utility function to display download progress. It calculates the percentage of +data downloaded and logs the progress in MB and percentage. If the total size is +unknown, it logs the downloaded size in MB. + +# Arguments +- `downloaded::Integer`: The number of bytes downloaded so far. +- `total::Integer`: The total number of bytes to be downloaded. If unknown, pass 0. +- `last_pct::Ref{Int}`: A reference to the last logged percentage to avoid redundant logs. + +# Returns +- `nothing` +""" +function progress_callback(downloaded::Integer, total::Integer, last_pct::Ref{Int}) + if total > 0 + safe_downloaded = min(downloaded, total) + pct = clamp(Int(floor(100 * safe_downloaded / total)), 0, 100) + if pct != last_pct[] + last_pct[] = pct + downloaded_mb = round(safe_downloaded / 1024^2; digits=1) + total_mb = round(total / 1024^2; digits=1) + @info "Download progress: $pct% ($downloaded_mb MB / $total_mb MB)" + end + else + downloaded_mb = round(downloaded / 1024^2; digits=1) + @info "Downloaded $downloaded_mb MB" + end + return nothing +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 4c92fd3..347482c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,15 @@ using HealthSampleData using Test -@testset "HealthSampleData.jl" begin - # Write your tests here. +@testset "Utilities" begin + @testset "progress_callback" begin + last_pct = Ref(-1) + @test HealthSampleData.progress_callback(512 * 1024^2, 1024 * 1024^2, last_pct) === nothing + @test last_pct[] == 50 # 50% progress + + @test HealthSampleData.progress_callback(1024 * 1024^2, 1024 * 1024^2, last_pct) === nothing + @test last_pct[] == 100 # 100% progress + + @test HealthSampleData.progress_callback(512 * 1024^2, 0, last_pct) === nothing # Unknown total size + end end From 5e3e5decb664d150edf018a63af60a14a87010b5 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 10 Nov 2025 20:43:50 +0530 Subject: [PATCH 03/15] Begin writing documentation --- docs/src/index.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index 4d69448..4edcb9f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,6 +4,14 @@ CurrentModule = HealthSampleData # HealthSampleData +> To provide consistent data sets for teaching and learning across JuliaHealth. + +Welcome to HealthSampleData.jl! +This package curates and provisions a number of datasets useful in health informatics, public health, medical imaging, and machine learning research. +It is made in an effort to provide learning resources that are consistently available across JuliaHealth. +The focus of `HealthSampleData.jl` is specifically on downloading, unpacking, and accessing benchmark dataset. +Functionality for the purpose of data processing or visualization is only provided to a degree that is special to some dataset. + Documentation for [HealthSampleData](https://github.com/TheCedarPrince/HealthSampleData.jl). ```@index From cc52ddf5e1395b4b8ee9a2a49def065788c748c7 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 10 Nov 2025 20:44:18 +0530 Subject: [PATCH 04/15] First draft of HuggingFace downloading --- src/HealthSampleData.jl | 1 - src/HuggingFaceDatasets/data.jl | 63 ++++++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/HealthSampleData.jl b/src/HealthSampleData.jl index c333a66..a3d08de 100644 --- a/src/HealthSampleData.jl +++ b/src/HealthSampleData.jl @@ -4,7 +4,6 @@ module HealthSampleData using HuggingFaceHub using Logging - include("utilities.jl") include("OMOP_Common_Data_Model/data.jl") include("HuggingFaceDatasets/data.jl") diff --git a/src/HuggingFaceDatasets/data.jl b/src/HuggingFaceDatasets/data.jl index 0a44e46..40723b8 100644 --- a/src/HuggingFaceDatasets/data.jl +++ b/src/HuggingFaceDatasets/data.jl @@ -1,21 +1,60 @@ -using HealthSampleData +function Synthea() + localpath = HealthSampleData._huggingface_dataset_register("Synthea", "JuliaHealthOrg/JuliaHealthDatasets", "synthea_1M_3YR.duckdb") + register(DataDep( + "Synthea", + "1 million patients each with 3 year retrospective medical histories generated using the Synthea data generator (https://synthea.mitre.org). DuckDB database following the OMOP Common Data Model layout.", + "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/blob/main/synthea_1M_3YR.duckdb"; + fetch_method = p -> localpath + )) -""" - register_huggingface_dataset(name::String, repo::String, filename::String) + datadep"Synthea" + + @info "Synthea data source is downloaded!" + + return datadep"Synthea/synthea_1M_3YR.duckdb" +end -Registers a dataset from HuggingFace as a DataDep and returns the local path. -""" -function register_huggingface_dataset(name::String, repo::String, filename::String) - localpath = HealthSampleData._huggingface_dataset_register(name, repo, filename) +function Test() + localpath = HealthSampleData._huggingface_dataset_register("Test", "JuliaHealthOrg/JuliaHealthDatasets", "penguins.csv") register(DataDep( - name, - "Dataset from Hugging Face repository $(repo).", - "https://huggingface.co/datasets/$(repo)/resolve/main/$(filename)"; + "Test", + """ + The Palmer Penguins test dataset for HealthSampleData.jl. To cite: + + Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer + Archipelago (Antarctica) penguin data. R package version 0.1.0. + https://allisonhorst.github.io/palmerpenguins/. doi: + 10.5281/zenodo.3960218. + + """, + "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/blob/main/penguins.csv"; fetch_method = p -> localpath )) - return localpath + datadep"Test" + + @info "Test data source is downloaded!" + + return datadep"Test/test_data.duckdb" +end + +""" + register_huggingface_dataset(name::String) + +Registers a dataset from HuggingFace as a DataDep and returns the local path. +""" +function download_hf_dataset(name::String) + if name == "Synthea" + @info "Downloading Synthea dataset as DataDep..." + return Synthea() + elseif + name == "Test" + @info "Downloading Test dataset as DataDep..." + return Test() + else + error("Dataset registration for $name is not implemented.") + end end -export register_huggingface_dataset \ No newline at end of file +export download_hf_dataset \ No newline at end of file From 525541bde9d03800e50fa5615e91ac0f3ce2b42b Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 10 Nov 2025 20:44:35 +0530 Subject: [PATCH 05/15] First draft of tests --- test/runtests.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 347482c..d469014 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,3 +13,12 @@ using Test @test HealthSampleData.progress_callback(512 * 1024^2, 0, last_pct) === nothing # Unknown total size end end + +@testset "HuggingFaceDatasets - Test dataset" begin + @test isa(HealthSampleData.Test, Function) + @test hasmethod(HealthSampleData.Test, Tuple{}) + + @test isa(HealthSampleData.download_hf_dataset, Function) + + @test_throws ErrorException HealthSampleData.download_hf_dataset("NonExistentDataset12345") +end From 47807d28b5cfd2a6fde23d5436f95e166726ed55 Mon Sep 17 00:00:00 2001 From: TheCedarPrince Date: Mon, 10 Nov 2025 10:32:44 -0500 Subject: [PATCH 06/15] Back up for Param's HF branch --- Project.toml | 8 ------- src/HuggingFaceDatasets/data.jl | 5 ++-- test/Manifest.toml | 41 +++++++++++++++++++++++++++++++++ test/Project.toml | 2 ++ 4 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 test/Manifest.toml create mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 7c4cb70..79498d9 100644 --- a/Project.toml +++ b/Project.toml @@ -7,15 +7,7 @@ version = "0.0.1" DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" HuggingFaceHub = "d0076355-e2c0-48e6-a044-05906e51b7fc" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] DataDeps = "0.7" julia = "1.10" - -[extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[targets] -test = ["Test"] diff --git a/src/HuggingFaceDatasets/data.jl b/src/HuggingFaceDatasets/data.jl index 40723b8..1ffa62f 100644 --- a/src/HuggingFaceDatasets/data.jl +++ b/src/HuggingFaceDatasets/data.jl @@ -48,8 +48,7 @@ function download_hf_dataset(name::String) if name == "Synthea" @info "Downloading Synthea dataset as DataDep..." return Synthea() - elseif - name == "Test" + elseif name == "Test" @info "Downloading Test dataset as DataDep..." return Test() else @@ -57,4 +56,4 @@ function download_hf_dataset(name::String) end end -export download_hf_dataset \ No newline at end of file +export download_hf_dataset diff --git a/test/Manifest.toml b/test/Manifest.toml new file mode 100644 index 0000000..6cb80db --- /dev/null +++ b/test/Manifest.toml @@ -0,0 +1,41 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.11.7" +manifest_format = "2.0" +project_hash = "71d91126b5a1fb1020e1098d9d492de2a4438fd2" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +version = "1.11.0" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +version = "1.11.0" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +version = "1.11.0" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +version = "1.11.0" + +[[deps.Random]] +deps = ["SHA"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +version = "1.11.0" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +version = "1.11.0" + +[[deps.Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +version = "1.11.0" diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..0c36332 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,2 @@ +[deps] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" From 3acfe4d9912536110b090497457daf0ee6083949 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 10 Nov 2025 21:40:45 +0530 Subject: [PATCH 07/15] First draft at making quick start guide --- docs/src/index.md | 31 ++++++++++++++++++--------- docs/src/quick_start.md | 39 ++++++++++++++++++++++++++++++++++ docs/src/supported_datasets.md | 17 +++++++++++++++ 3 files changed, 77 insertions(+), 10 deletions(-) create mode 100644 docs/src/quick_start.md create mode 100644 docs/src/supported_datasets.md diff --git a/docs/src/index.md b/docs/src/index.md index 4edcb9f..23dedfa 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,17 +6,28 @@ CurrentModule = HealthSampleData > To provide consistent data sets for teaching and learning across JuliaHealth. -Welcome to HealthSampleData.jl! +Welcome to `HealthSampleData.jl`! + This package curates and provisions a number of datasets useful in health informatics, public health, medical imaging, and machine learning research. -It is made in an effort to provide learning resources that are consistently available across JuliaHealth. -The focus of `HealthSampleData.jl` is specifically on downloading, unpacking, and accessing benchmark dataset. -Functionality for the purpose of data processing or visualization is only provided to a degree that is special to some dataset. +It is made in an effort to provide learning resources that are consistently available across JuliaHealth. -Documentation for [HealthSampleData](https://github.com/TheCedarPrince/HealthSampleData.jl). +## Dataset Overview -```@index -``` +`HealthSampleData.jl` uses `DataDeps.jl` to download data sources from a variety of locations. +Each dataset provides: -```@autodocs -Modules = [HealthSampleData] -``` +1. A short description +2. Relevant links or resources +3. Its file type (e.g. CSV, sqlite, etc.) +4. A quickstart guide +5. Where it is being downloaded from + +> **NOTE:** For more information about datasets and data sources, please refer to [Supported Datasets](./supported_datasets). + +## Installation + +To install `HealthSampleData.jl`, type the following snippet into the Julia REPL: + +```julia +Pkg.add("HealthSampleData.jl") +``` \ No newline at end of file diff --git a/docs/src/quick_start.md b/docs/src/quick_start.md new file mode 100644 index 0000000..b5bc5ab --- /dev/null +++ b/docs/src/quick_start.md @@ -0,0 +1,39 @@ +# Quick Start Guide + +Here is a complete example workflow for how someone would want to use `HealthSampleData.jl`. + +## Installation + +To install `HealthSampleData.jl`, type the following snippet into the Julia REPL: + +```julia +Pkg.add("HealthSampleData.jl") +``` + +## Download a Dataset + +We'll download a small dataset: + +```julia +import HealthSampleData: + Test + +Test() +``` + +You should see something like the following: + +```text +This program has requested access to the data dependency Test. +which is not currently installed. It can be installed automatically, and you will not see this message again. + +The Palmer Penguins test dataset for HealthSampleData.jl. To cite: + +Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer +Archipelago (Antarctica) penguin data. R package version 0.1.0. +https://allisonhorst.github.io/palmerpenguins/. doi: +10.5281/zenodo.3960218. + +Do you want to download the dataset from https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/penguins.csv to "C:\Users\You\.julia\scratchspaces\[UUID]\datadeps\Test"? +[y/n] +``` \ No newline at end of file diff --git a/docs/src/supported_datasets.md b/docs/src/supported_datasets.md new file mode 100644 index 0000000..5631beb --- /dev/null +++ b/docs/src/supported_datasets.md @@ -0,0 +1,17 @@ +# Supported Datasets + +`HealthSampleData.jl` supports a variety of datasets from various sources. +Here is an overview of available datasets grouped loosely by domain: + +## Patient Medical Records + +```@docs +Eunomia +Synthea +``` + +## Miscellaneous Data Sets + +```@docs +Test +``` \ No newline at end of file From 23d0566dd5b7ee2d11bbeb40176e1af60965d0e0 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 10 Nov 2025 21:41:49 +0530 Subject: [PATCH 08/15] Added Changes to huggingface.jl files --- src/HealthSampleData.jl | 1 + src/HuggingFaceDatasets/data.jl | 11 ++++---- src/huggingface.jl | 47 ++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/src/HealthSampleData.jl b/src/HealthSampleData.jl index a3d08de..fd4e760 100644 --- a/src/HealthSampleData.jl +++ b/src/HealthSampleData.jl @@ -5,6 +5,7 @@ module HealthSampleData using Logging include("utilities.jl") + include("huggingface.jl") include("OMOP_Common_Data_Model/data.jl") include("HuggingFaceDatasets/data.jl") diff --git a/src/HuggingFaceDatasets/data.jl b/src/HuggingFaceDatasets/data.jl index 1ffa62f..4287044 100644 --- a/src/HuggingFaceDatasets/data.jl +++ b/src/HuggingFaceDatasets/data.jl @@ -4,7 +4,8 @@ function Synthea() "Synthea", "1 million patients each with 3 year retrospective medical histories generated using the Synthea data generator (https://synthea.mitre.org). DuckDB database following the OMOP Common Data Model layout.", "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/blob/main/synthea_1M_3YR.duckdb"; - fetch_method = p -> localpath + # fetch_method gets called as (remotepath, localdir) by DataDeps + fetch_method = (remotepath, localdir) -> localpath )) datadep"Synthea" @@ -26,10 +27,10 @@ function Test() Archipelago (Antarctica) penguin data. R package version 0.1.0. https://allisonhorst.github.io/palmerpenguins/. doi: 10.5281/zenodo.3960218. - + """, - "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/blob/main/penguins.csv"; - fetch_method = p -> localpath + "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/penguins.csv"; + fetch_method = (remotepath, localdir) -> localpath )) datadep"Test" @@ -56,4 +57,4 @@ function download_hf_dataset(name::String) end end -export download_hf_dataset +export download_hf_dataset \ No newline at end of file diff --git a/src/huggingface.jl b/src/huggingface.jl index 3ce7882..a33048c 100644 --- a/src/huggingface.jl +++ b/src/huggingface.jl @@ -1,4 +1,4 @@ -using HealthSampleData +using Downloads const HF = HuggingFaceHub @@ -9,33 +9,60 @@ Resolve dataset metadata from Hugging Face, download `filename` via HuggingFaceH and return the local filesystem path to the downloaded file. Displays download progress when possible. """ function _huggingface_dataset_register(name::String, repo::String, filename::String) - @info "Resolving Huggingface Metadata for $repo" - dataset = HF.info(HF.Dataset, repo) + # Ensure full Hugging Face dataset URL + if !startswith(repo, "http") + repo = "https://huggingface.co/datasets/$(repo)" + end + + @info "Resolving Hugging Face metadata for $repo" + # Try fetching dataset info safely + dataset = try + HF.info(HF.Dataset, repo) + catch e + @warn "Failed to resolve dataset metadata: $e. Falling back to direct download." + nothing + end + + # Set up progress callback last_pct = Ref(-1) - progress_fn = (downloaded, total) -> HealthSampleData.utilities.progress_callback(downloaded, total, last_pct) + progress_fn = (downloaded, total) -> progress_callback(downloaded, total, last_pct) @info "Downloading $filename from $repo via HuggingFaceHub..." try - localpath = HF.file_download(dataset, filename; progress = progress_fn) + # Prefer official HuggingFaceHub download if dataset info is available + if dataset !== nothing + localpath = HF.file_download(dataset, filename; progress = progress_fn) + else + # Direct fallback if HF.info failed + url = "$repo/resolve/main/$filename" + tmpdir = mktempdir() + dest = joinpath(tmpdir, filename) + @info "Downloading $url -> $dest" + Downloads.download(url, dest; progress = progress_fn) + localpath = dest + end @info "Downloaded to $localpath" + return localpath + catch e msg = string(e) - if occursin("symlink", msg) || occursin("creating symlinks", msg) || occursin("Administrator", msg) || occursin("operation not permitted", msg) - @warn "Symlink creation failed (likely Windows privilege). Falling back to direct HTTP download: $e" - url = "https://huggingface.co/datasets/$(repo)/resolve/main/$(filename)" + if occursin("symlink", msg) || occursin("creating symlinks", msg) || + occursin("Administrator", msg) || occursin("operation not permitted", msg) + + @warn "Symlink creation failed (likely Windows privilege issue). Falling back to direct HTTP download: $e" + url = "$repo/resolve/main/$filename" tmpdir = mktempdir() dest = joinpath(tmpdir, filename) @info "Downloading $url -> $dest (no symlink)" Downloads.download(url, dest; progress = progress_fn) localpath = dest @info "Fallback download complete: $localpath" + return localpath else rethrow(e) end end - - return localpath end export _huggingface_dataset_register \ No newline at end of file From 595023385944f1693faa7b251974cd6e2119605d Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 10 Nov 2025 21:42:04 +0530 Subject: [PATCH 09/15] Modified Project.toml --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 79498d9..44c0e61 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,15 @@ name = "HealthSampleData" uuid = "85295614-c7c2-47eb-9e31-7664d7fbe6db" -authors = ["TheCedarPrince , ParamThakkar123 and contributors"] version = "0.0.1" +authors = ["TheCedarPrince , ParamThakkar123 and contributors"] [deps] DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" HuggingFaceHub = "d0076355-e2c0-48e6-a044-05906e51b7fc" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" [compat] DataDeps = "0.7" +Downloads = "1.6.0" julia = "1.10" From 4802009a171a64e84502e390bc485ed00c02253b Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Tue, 11 Nov 2025 16:45:38 +0530 Subject: [PATCH 10/15] Added fix for download failures --- src/HuggingFaceDatasets/data.jl | 39 +++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/HuggingFaceDatasets/data.jl b/src/HuggingFaceDatasets/data.jl index 4287044..5512656 100644 --- a/src/HuggingFaceDatasets/data.jl +++ b/src/HuggingFaceDatasets/data.jl @@ -1,3 +1,13 @@ +# helper: ensure a stable per-user cache for immediate returns (avoids datadep interactive prompts) +function _cache_and_return(localpath::String, name::String, filename::String) + cache_dir = joinpath(homedir(), ".julia", "HealthSampleData_datasets", name) + mkpath(cache_dir) + dest = joinpath(cache_dir, filename) + cp(localpath, dest; force=true) + @info "$name dataset cached at $dest" + return dest +end + function Synthea() localpath = HealthSampleData._huggingface_dataset_register("Synthea", "JuliaHealthOrg/JuliaHealthDatasets", "synthea_1M_3YR.duckdb") register(DataDep( @@ -5,14 +15,17 @@ function Synthea() "1 million patients each with 3 year retrospective medical histories generated using the Synthea data generator (https://synthea.mitre.org). DuckDB database following the OMOP Common Data Model layout.", "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/blob/main/synthea_1M_3YR.duckdb"; # fetch_method gets called as (remotepath, localdir) by DataDeps - fetch_method = (remotepath, localdir) -> localpath + fetch_method = (remotepath, localdir) -> begin + mkpath(localdir) + dest = joinpath(localdir, "synthea_1M_3YR.duckdb") + cp(localpath, dest; force=true) + return dest + end )) - datadep"Synthea" - - @info "Synthea data source is downloaded!" - - return datadep"Synthea/synthea_1M_3YR.duckdb" + # Do not call datadep"Synthea" directly (avoids interactive prompt when a stale datadep dir exists). + # Instead, cache the downloaded file in a per-user location and return that path immediately. + return _cache_and_return(localpath, "Synthea", "synthea_1M_3YR.duckdb") end @@ -30,14 +43,16 @@ function Test() """, "https://huggingface.co/datasets/JuliaHealthOrg/JuliaHealthDatasets/penguins.csv"; - fetch_method = (remotepath, localdir) -> localpath + fetch_method = (remotepath, localdir) -> begin + mkpath(localdir) + dest = joinpath(localdir, "penguins.csv") + cp(localpath, dest; force=true) + return dest + end )) - datadep"Test" - - @info "Test data source is downloaded!" - - return datadep"Test/test_data.duckdb" + # Avoid calling datadep"Test" here to prevent DataDeps interactive prompt on stale installs. + return _cache_and_return(localpath, "Test", "penguins.csv") end """ From 9047a363aefe25a7c78c77203c2948b42f463069 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Wed, 12 Nov 2025 00:11:27 +0530 Subject: [PATCH 11/15] Removed tests for progress callback --- test/runtests.jl | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d469014..4333764 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,16 +2,6 @@ using HealthSampleData using Test @testset "Utilities" begin - @testset "progress_callback" begin - last_pct = Ref(-1) - @test HealthSampleData.progress_callback(512 * 1024^2, 1024 * 1024^2, last_pct) === nothing - @test last_pct[] == 50 # 50% progress - - @test HealthSampleData.progress_callback(1024 * 1024^2, 1024 * 1024^2, last_pct) === nothing - @test last_pct[] == 100 # 100% progress - - @test HealthSampleData.progress_callback(512 * 1024^2, 0, last_pct) === nothing # Unknown total size - end end @testset "HuggingFaceDatasets - Test dataset" begin From 5a66f41ee85c414598a295037be2d16838d8c650 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Wed, 12 Nov 2025 00:14:03 +0530 Subject: [PATCH 12/15] Simplified dataset logic to HF.info --- src/huggingface.jl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/huggingface.jl b/src/huggingface.jl index a33048c..4c89e48 100644 --- a/src/huggingface.jl +++ b/src/huggingface.jl @@ -17,12 +17,7 @@ function _huggingface_dataset_register(name::String, repo::String, filename::Str @info "Resolving Hugging Face metadata for $repo" # Try fetching dataset info safely - dataset = try - HF.info(HF.Dataset, repo) - catch e - @warn "Failed to resolve dataset metadata: $e. Falling back to direct download." - nothing - end + dataset = HF.info(HF.Dataset, repo) # Set up progress callback last_pct = Ref(-1) From e5d8648907a402f1d808685938f8a1692df287b6 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Wed, 12 Nov 2025 00:17:51 +0530 Subject: [PATCH 13/15] Added some fixes --- src/HuggingFaceDatasets/data.jl | 25 ++++++++++--------------- src/huggingface.jl | 4 ---- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/HuggingFaceDatasets/data.jl b/src/HuggingFaceDatasets/data.jl index 5512656..dc4a612 100644 --- a/src/HuggingFaceDatasets/data.jl +++ b/src/HuggingFaceDatasets/data.jl @@ -1,13 +1,3 @@ -# helper: ensure a stable per-user cache for immediate returns (avoids datadep interactive prompts) -function _cache_and_return(localpath::String, name::String, filename::String) - cache_dir = joinpath(homedir(), ".julia", "HealthSampleData_datasets", name) - mkpath(cache_dir) - dest = joinpath(cache_dir, filename) - cp(localpath, dest; force=true) - @info "$name dataset cached at $dest" - return dest -end - function Synthea() localpath = HealthSampleData._huggingface_dataset_register("Synthea", "JuliaHealthOrg/JuliaHealthDatasets", "synthea_1M_3YR.duckdb") register(DataDep( @@ -23,9 +13,11 @@ function Synthea() end )) - # Do not call datadep"Synthea" directly (avoids interactive prompt when a stale datadep dir exists). - # Instead, cache the downloaded file in a per-user location and return that path immediately. - return _cache_and_return(localpath, "Synthea", "synthea_1M_3YR.duckdb") + datadep"Synthea" + + @info "Synthea data source is downloaded!" + + return "Synthea/synthea_1M_3YR.duckdb" end @@ -51,8 +43,11 @@ function Test() end )) - # Avoid calling datadep"Test" here to prevent DataDeps interactive prompt on stale installs. - return _cache_and_return(localpath, "Test", "penguins.csv") + datadep"Test" + + @info "Test data source is downloaded!" + + return "Test/penguins.csv" end """ diff --git a/src/huggingface.jl b/src/huggingface.jl index 4c89e48..0692528 100644 --- a/src/huggingface.jl +++ b/src/huggingface.jl @@ -9,10 +9,6 @@ Resolve dataset metadata from Hugging Face, download `filename` via HuggingFaceH and return the local filesystem path to the downloaded file. Displays download progress when possible. """ function _huggingface_dataset_register(name::String, repo::String, filename::String) - # Ensure full Hugging Face dataset URL - if !startswith(repo, "http") - repo = "https://huggingface.co/datasets/$(repo)" - end @info "Resolving Hugging Face metadata for $repo" From c1707c543b8c9ecc0c938b7c6550d79b8dd3017a Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Wed, 12 Nov 2025 00:37:07 +0530 Subject: [PATCH 14/15] removed progress callback --- src/huggingface.jl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/huggingface.jl b/src/huggingface.jl index 0692528..3f4336e 100644 --- a/src/huggingface.jl +++ b/src/huggingface.jl @@ -6,7 +6,7 @@ const HF = HuggingFaceHub _huggingface_dataset_register(name::String, repo::String, filename::String) Resolve dataset metadata from Hugging Face, download `filename` via HuggingFaceHub, -and return the local filesystem path to the downloaded file. Displays download progress when possible. +and return the local filesystem path to the downloaded file """ function _huggingface_dataset_register(name::String, repo::String, filename::String) @@ -15,22 +15,18 @@ function _huggingface_dataset_register(name::String, repo::String, filename::Str # Try fetching dataset info safely dataset = HF.info(HF.Dataset, repo) - # Set up progress callback - last_pct = Ref(-1) - progress_fn = (downloaded, total) -> progress_callback(downloaded, total, last_pct) - @info "Downloading $filename from $repo via HuggingFaceHub..." try # Prefer official HuggingFaceHub download if dataset info is available if dataset !== nothing - localpath = HF.file_download(dataset, filename; progress = progress_fn) + localpath = HF.file_download(dataset, filename) else # Direct fallback if HF.info failed url = "$repo/resolve/main/$filename" tmpdir = mktempdir() dest = joinpath(tmpdir, filename) @info "Downloading $url -> $dest" - Downloads.download(url, dest; progress = progress_fn) + Downloads.download(url, dest) localpath = dest end @info "Downloaded to $localpath" @@ -46,7 +42,7 @@ function _huggingface_dataset_register(name::String, repo::String, filename::Str tmpdir = mktempdir() dest = joinpath(tmpdir, filename) @info "Downloading $url -> $dest (no symlink)" - Downloads.download(url, dest; progress = progress_fn) + Downloads.download(url, dest) localpath = dest @info "Fallback download complete: $localpath" return localpath From e8523d361746b5ab836743b5509f5a21eadc2508 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Wed, 12 Nov 2025 00:37:31 +0530 Subject: [PATCH 15/15] removed utilities file --- src/utilities.jl | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 src/utilities.jl diff --git a/src/utilities.jl b/src/utilities.jl deleted file mode 100644 index f6f980d..0000000 --- a/src/utilities.jl +++ /dev/null @@ -1,31 +0,0 @@ -""" - progress_callback(downloaded::Integer, total::Integer, last_pct::Ref{Int}) - -A utility function to display download progress. It calculates the percentage of -data downloaded and logs the progress in MB and percentage. If the total size is -unknown, it logs the downloaded size in MB. - -# Arguments -- `downloaded::Integer`: The number of bytes downloaded so far. -- `total::Integer`: The total number of bytes to be downloaded. If unknown, pass 0. -- `last_pct::Ref{Int}`: A reference to the last logged percentage to avoid redundant logs. - -# Returns -- `nothing` -""" -function progress_callback(downloaded::Integer, total::Integer, last_pct::Ref{Int}) - if total > 0 - safe_downloaded = min(downloaded, total) - pct = clamp(Int(floor(100 * safe_downloaded / total)), 0, 100) - if pct != last_pct[] - last_pct[] = pct - downloaded_mb = round(safe_downloaded / 1024^2; digits=1) - total_mb = round(total / 1024^2; digits=1) - @info "Download progress: $pct% ($downloaded_mb MB / $total_mb MB)" - end - else - downloaded_mb = round(downloaded / 1024^2; digits=1) - @info "Downloaded $downloaded_mb MB" - end - return nothing -end \ No newline at end of file