diff --git a/src/components/App/App.tsx b/src/components/App/App.tsx index 2181cb6f..3a539adc 100644 --- a/src/components/App/App.tsx +++ b/src/components/App/App.tsx @@ -1,6 +1,7 @@ import { useMemo } from 'react' import { Config, ConfigProvider } from '../../hooks/useConfig.js' import { getHttpSource } from '../../lib/sources/httpSource.js' +import { getHuggingFaceSource } from '../../lib/sources/huggingFaceSource.js' import { getHyperparamSource } from '../../lib/sources/hyperparamSource.js' import Page from '../Page/Page.js' @@ -10,7 +11,9 @@ export default function App() { const row = search.get('row') === null ? undefined : Number(search.get('row')) const col = search.get('col') === null ? undefined : Number(search.get('col')) - const source = getHttpSource(sourceId) ?? getHyperparamSource(sourceId, { endpoint: location.origin }) + const source = getHuggingFaceSource(sourceId) ?? + getHttpSource(sourceId) ?? + getHyperparamSource(sourceId, { endpoint: location.origin }) // Memoize the config to avoid creating a new object on each render const config: Config = useMemo(() => ({ diff --git a/src/components/Breadcrumb/Breadcrumb.module.css b/src/components/Breadcrumb/Breadcrumb.module.css index 34dcb4c6..f9e6df98 100644 --- a/src/components/Breadcrumb/Breadcrumb.module.css +++ b/src/components/Breadcrumb/Breadcrumb.module.css @@ -15,7 +15,7 @@ min-height: 32px; border-bottom: 1px solid #ddd; background: var(--color-background-dark); - padding: 0 10px 0 20px; + padding: 0 20px; border-radius: var(--border-radius-lg); margin: var(--space-3xs); /* TODO(SL): forbid overflow? */ @@ -56,6 +56,13 @@ .versions { padding-left: 4px; + & > button { + color: #eee; + + &:hover, &:focus { + color: #fff + } + } [aria-current] { font-weight: bold; diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts new file mode 100644 index 00000000..f3bef842 --- /dev/null +++ b/src/lib/sources/huggingFaceSource.ts @@ -0,0 +1,360 @@ +import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' +import { getFileName } from './utils.js' + +type RepoType = 'model' | 'dataset' | 'space' + +interface BaseUrl { + source: string + origin: string + type: RepoType + repo: string + branch: string + path: string +} + +interface DirectoryUrl extends BaseUrl { + kind: 'directory' + action: 'tree' +} + +interface FileUrl extends BaseUrl { + kind: 'file' + action: 'resolve' | 'blob' + resolveUrl: string +} + +type HFUrl = DirectoryUrl | FileUrl; + +interface RefResponse { + name: string; + ref: string; + targetCommit: string; +} + +const refTypes = [ + 'branches', + 'tags', + 'converts', + 'pullRequests', +] as const +type RefType = (typeof refTypes)[number]; +type RefsResponse = Partial>; + +interface RefMetadata extends RefResponse { + refType: RefType; // TODO(SL): use it to style the refs differently? +} + +const baseUrl = 'https://huggingface.co' + +function getFullName(url: HFUrl): string { + return url.type === 'dataset' ? `datasets/${url.repo}` : url.type === 'space' ? `spaces/${url.repo}` : url.repo +} +function getSourceParts(url: HFUrl): SourcePart[] { + const fullName = getFullName(url) + const sourceParts: SourcePart[] = [{ + sourceId: `${baseUrl}/${fullName}/tree/${url.branch}/`, + text: `${baseUrl}/${fullName}/${url.action}/${url.branch}/`, + }] + + const pathParts = url.path.split('/').filter(d => d.length > 0) + const lastPart = pathParts.at(-1) + if (lastPart) { + for (const [i, part] of pathParts.slice(0, -1).entries()) { + sourceParts.push({ + sourceId: `${baseUrl}/${fullName}/tree/${url.branch}/${pathParts.slice(0, i + 1).join('/')}`, + text: part + '/', + }) + } + sourceParts.push({ + sourceId: `${baseUrl}/${fullName}/${url.action}/${url.branch}${url.path}`, + text: lastPart, + }) + } + return sourceParts +} +function getPrefix(url: DirectoryUrl): string { + return `${url.origin}/${getFullName(url)}/tree/${url.branch}${url.path}`.replace(/\/$/, '') +} +async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise { + const repoFullName = getFullName(url) + const filesIterator = listFiles({ + repoFullName, + revision: url.branch, + path: 'path' in url ? url.path.replace(/^\//, '') : '', // remove leading slash if any + accessToken: options?.accessToken, + }) + const files: FileMetadata[] = [] + for await (const file of filesIterator) { + files.push({ + name: getFileName(file.path), + eTag: file.lastCommit?.id, + size: file.size, + lastModified: file.lastCommit?.date, + sourceId: `${url.origin}/${getFullName(url)}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), + kind: file.type === 'file' ? 'file' : 'directory', // 'unknown' is considered as a directory + }) + } + return files +} +export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined { + try { + const url = parseHuggingFaceUrl(sourceId) + async function fetchVersions() { + const refsList = await fetchRefsList(url, options) + return { + label: 'Branches', + versions: refsList.map(({ refType, name, ref }) => { + const label = refType === 'branches' ? name : + refType === 'converts' ? `[convert] ${name}` : + refType === 'tags' ? `[tag] ${name}` : + `[pr] ${name}` + // remove refs/heads/ from the ref name + // e.g. refs/heads/main -> main + const fixedRef = refType === 'branches' ? ref.replace(/refs\/heads\//, '') : ref + const branchSourceId = `${url.origin}/${getFullName(url)}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}` + return { + label, + sourceId: branchSourceId, + } + }), + } + } + if (url.kind === 'file') { + return { + kind: 'file', + sourceId, + sourceParts: getSourceParts(url), + fileName: getFileName(url.path), + resolveUrl: url.resolveUrl, + requestInit: options?.requestInit, + fetchVersions, + } + } else { + return { + kind: 'directory', + sourceId, + sourceParts: getSourceParts(url), + prefix: getPrefix(url), + listFiles: () => fetchFilesList(url, options), + fetchVersions, + } + } + } catch { + return undefined + } +} + +export function parseHuggingFaceUrl(url: string): HFUrl { + const urlObject = new URL(url) + // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL + + if ( + urlObject.protocol !== 'https:' || + ![ + 'huggingface.co', 'huggingface.com', 'hf.co', + // hf.com is not a HF domain + ].includes(urlObject.host) + ) { + throw new Error('Not a Hugging Face URL') + } + + let { pathname } = urlObject + let type: RepoType = 'model' + if (pathname.startsWith('/datasets')) { + type = 'dataset' + pathname = pathname.slice('/datasets'.length) + } else if (pathname.startsWith('/spaces')) { + type = 'space' + pathname = pathname.slice('/spaces'.length) + } + + const repoGroups = /^\/(?[^/]+)\/(?[^/]+)\/?$/.exec( + pathname + )?.groups + if (repoGroups?.namespace !== undefined && repoGroups.repo !== undefined) { + return { + kind: 'directory', + source: url, + origin: urlObject.origin, + type, + repo: repoGroups.namespace + '/' + repoGroups.repo, + action: 'tree', + branch: 'main', // hardcode the default branch + path: '', + } + } + + const folderGroups = + /^\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)*)\/?$/.exec( + pathname + )?.groups + if ( + folderGroups?.namespace !== undefined && + folderGroups.repo !== undefined && + folderGroups.action !== undefined && + folderGroups.branch !== undefined && + folderGroups.path !== undefined && + folderGroups.branch !== 'refs' + ) { + const typePath = type === 'dataset' ? '/datasets' : type === 'space' ? '/spaces' : '' + const branch = folderGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}${typePath}/${folderGroups.namespace}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}` + return { + kind: 'directory', + source, + origin: urlObject.origin, + type, + repo: folderGroups.namespace + '/' + folderGroups.repo, + action: 'tree', + branch, + path: folderGroups.path, + } + } + + const fileGroups = + /^\/(?[^/]+)\/(?[^/]+)\/(?blob|resolve)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)+)$/.exec( + pathname + )?.groups + if ( + fileGroups?.namespace !== undefined && + fileGroups.repo !== undefined && + fileGroups.action !== undefined && + fileGroups.branch !== undefined && + fileGroups.path !== undefined && + fileGroups.branch !== 'refs' + ) { + const typePath = type === 'dataset' ? '/datasets' : type === 'space' ? '/spaces' : '' + const branch = fileGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}${typePath}/${fileGroups.namespace}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}` + return { + kind: 'file', + source, + origin: urlObject.origin, + type, + repo: fileGroups.namespace + '/' + fileGroups.repo, + action: fileGroups.action === 'blob' ? 'blob' : 'resolve', + branch, + path: fileGroups.path, + resolveUrl: `${urlObject.origin}${typePath}/${fileGroups.namespace}/${fileGroups.repo}/resolve/${branch}${fileGroups.path}`, + } + } + + throw new Error('Unsupported Hugging Face URL') +} + +/** + * List refs in a HF dataset repo + * + * Example API URL: https://huggingface.co/api/datasets/codeparrot/github-code/refs + * + * @param repo (namespace/repo) + * @param [options] + * @param [options.requestInit] - request init object to pass to fetch + * @param [options.accessToken] - access token to use for authentication + * + * @returns the list of branches, tags, pull requests, and converts + */ +async function fetchRefsList( + url: HFUrl, + options?: {requestInit?: RequestInit, accessToken?: string} +): Promise { + if (options?.accessToken && !options.accessToken.startsWith('hf_')) { + throw new TypeError('Your access token must start with \'hf_\'') + } + const headers = new Headers(options?.requestInit?.headers) + headers.set('accept', 'application/json') + if (options?.accessToken) { + headers.set('Authorization', `Bearer ${options.accessToken}`) + } + const response = await fetch(`https://huggingface.co/api/${url.type}s/${url.repo}/refs`, { ...options?.requestInit, headers }) + if (!response.ok) { + throw new Error(`HTTP error ${response.status.toString()}`) + } + const refsByType = await response.json() as RefsResponse + return refTypes.flatMap((refType) => { + const refResponse = refsByType[refType] + if (!refResponse) { + return [] + } + return refResponse.map((refResponse) => { + return { + refType, + ...refResponse, + } + }) + }) +} + +/* + * Copied and adapted from https://github.com/huggingface/huggingface.js/blob/main/packages/hub + * MIT License, Copyright (c) 2023 Hugging Face + */ + +interface ListFileEntry { + type: 'file' | 'directory' | 'unknown'; + size: number; + path: string; + lastCommit?: { + date: string; + id: string; + }; +} + +const HUB_URL = 'https://huggingface.co' + +/** + * List files in a folder. To list ALL files in the directory, call it + * with {@link params.recursive} set to `true`. + */ +async function* listFiles( + params: { + repoFullName: string; + /** + * Eg 'data' for listing all files in the 'data' folder. Leave it empty to list all + * files in the repo. + */ + path?: string; + revision?: string; + /** + * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. + */ + fetch?: typeof fetch; + accessToken?: string; + } +): AsyncGenerator { + let url: string | undefined = `${HUB_URL}/api/${params.repoFullName}/tree/${ + params.revision ?? 'main' + }${params.path ? '/' + params.path : ''}?expand=true` + + while (url) { + const res: Response = await (params.fetch ?? fetch)(url, { + headers: { + accept: 'application/json', + ...params.accessToken ? { Authorization: `Bearer ${params.accessToken}` } : undefined, + }, + }) + + if (!res.ok) { + throw new Error(`Failed to list files: ${res.status.toString()} ${res.statusText}`) + } + + const items = await res.json() as ListFileEntry[] + + for (const item of items) { + yield item + } + + const linkHeader = res.headers.get('Link') + + url = linkHeader ? parseLinkHeader(linkHeader).next : undefined + } +} + +/** + * Parse Link HTTP header, eg `; rel="next"` + */ +export function parseLinkHeader(header: string): Record { + const regex = /<(https?:[/][/][^>]+)>;\s+rel="([^"]+)"/g + + return Object.fromEntries([...header.matchAll(regex)].map(([, url, rel]) => [rel, url])) as Record +} diff --git a/src/lib/sources/index.ts b/src/lib/sources/index.ts index 52437851..648af03d 100644 --- a/src/lib/sources/index.ts +++ b/src/lib/sources/index.ts @@ -1,5 +1,6 @@ export { getHttpSource } from './httpSource.js' export { getHyperparamSource } from './hyperparamSource.js' +export { getHuggingFaceSource } from './huggingFaceSource.js' export type { HyperparamFileMetadata } from './hyperparamSource.js' export type { DirSource, FileKind, FileMetadata, FileSource, Source, SourcePart } from './types.js' export { getFileName } from './utils.js' diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts new file mode 100644 index 00000000..4fa073c6 --- /dev/null +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -0,0 +1,207 @@ +import { describe, expect, it, test } from 'vitest' +import { parseHuggingFaceUrl } from '../../../src/lib/sources/huggingFaceSource.js' + +const origin = 'https://huggingface.co' + +describe('parseHuggingFaceUrl', () => { + test.for([ + 'huggingface.co', + 'huggingface.com', + 'hf.co', + ])('accepts domain: %s', (domain) => { + const origin = `https://${domain}` + const url = `${origin}/datasets/namespace/repo` + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'directory', + origin, + repo: 'namespace/repo', + type: 'dataset', + source: url, + action: 'tree', + branch: 'main', + path: '', + }) + }) + it('throws for unsupported scheme or domain', () => { + expect(() => parseHuggingFaceUrl('ftp://huggingface.co/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('email://huggingface.co/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('http://huggingface.co/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('https://hf.com/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('https://github.com/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('huggingface.co/datasets/namespace/repo')).toThrow() + }) + + test.for([ + '', + '/', + // for the following tests, the same is true: + // - with a trailing slash + // - replacing /datasets with /anything, /spaces, /models or /. + // Avoiding for brevity. + '/datasets', + '/datasets/namespace', + '/datasets/namespace/repo/branch', + '/datasets/namespace/repo/tree', + '/datasets/namespace/repo/blob', + '/datasets/namespace/repo/resolve', + '/datasets/namespace/repo/blob/branch', + '/datasets/namespace/repo/resolve/branch', + // note the trailing slash + '/datasets/namespace/repo/blob/branch/file/', + '/datasets/namespace/repo/resolve/branch/file/', + ])('throws for invalid path: %s', (path) => { + expect(() => parseHuggingFaceUrl(`https://huggingface.co${path}`)).to.throw() + }) + + test.for([ + { type: 'dataset', typePath: 'datasets/' }, + { type: 'space', typePath: 'spaces/' }, + { type: 'model', typePath: '' }, + ].flatMap(({ type, typePath }) => [ + // Root directory + [ + `https://huggingface.co/${typePath}namespace/repo`, + `https://huggingface.co/${typePath}namespace/repo`, + 'namespace/repo', + type, + 'main', + '', + ], + [ + `https://huggingface.co/${typePath}namespace/repo/`, + `https://huggingface.co/${typePath}namespace/repo/`, + 'namespace/repo', + type, + 'main', + '', + ], + // all-number identifier is not a valid HF repo name, but we accept any string + [ + `https://huggingface.co/${typePath}namespace/123`, + `https://huggingface.co/${typePath}namespace/123`, + 'namespace/123', + type, + 'main', + '', + ], + // Branches + [ + `https://huggingface.co/${typePath}namespace/repo/tree/branch`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch`, + 'namespace/repo', + type, + 'branch', + '', + ], + [ + `https://huggingface.co/${typePath}namespace/repo/tree/branch/`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch`, + 'namespace/repo', + type, + 'branch', + '', + ], + // special case: both forms 'refs/convert/parquet' and 'refs%2Fconvert%2Fparquet' are accepted + // see note in https://url.spec.whatwg.org/#dom-urlsearchparams-urlsearchparams + [ + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fconvert%2Fparquet`, + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fconvert%2Fparquet`, + 'namespace/repo', + type, + 'refs%2Fconvert%2Fparquet', + '', + ], + [ + `https://huggingface.co/${typePath}namespace/repo/tree/refs/convert/parquet`, + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fconvert%2Fparquet`, + 'namespace/repo', + type, + 'refs%2Fconvert%2Fparquet', + '', + ], + // PRs are also accepted + [ + `https://huggingface.co/${typePath}namespace/repo/tree/refs/pr/9`, + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fpr%2F9`, + 'namespace/repo', + type, + 'refs%2Fpr%2F9', + '', + ], + // Subdirectories + [ + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder`, + 'namespace/repo', + type, + 'branch', + '/folder', + ], + [ + `https://huggingface.co/${typePath}namespace/repo/tree/branch/a/b/c/`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch/a/b/c`, + 'namespace/repo', + type, + 'branch', + '/a/b/c', + ], + // A subdirectory can have a dot in its name (what matters is 'tree' vs 'blob' or 'resolve') + [ + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder.parquet`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder.parquet`, + 'namespace/repo', + type, + 'branch', + '/folder.parquet', + ], + ]))( + 'parses a DirectoryUrl for $type root or subdirectory: %s', + ([url, source, repo, type, branch, path]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'directory', + origin, + repo, + type, + source, + action: 'tree', + branch, + path, + }) + } + ) + + const repo = 'namespace/repo' + const path = '/path/to/file.parquet' + test.for( + [ + { type: 'dataset', typePath: 'datasets/' }, + { type: 'space', typePath: 'spaces/' }, + { type: 'model', typePath: '' }, + ].flatMap(d => [ + { ...d, branch: 'branch', sanitizedBranch: 'branch' }, + { ...d, branch: 'refs/convert/parquet', sanitizedBranch: 'refs%2Fconvert%2Fparquet' }, + { ...d, branch: 'refs%2Fconvert%2Fparquet', sanitizedBranch: 'refs%2Fconvert%2Fparquet' }, + ]).flatMap(d => [ + { ...d, action: 'blob' }, + { ...d, action: 'resolve' }, + ]).flatMap(d => [ + { ...d, url: `https://huggingface.co/${d.typePath}${repo}/${d.action}/${d.branch}${path}` }, + ]))( + 'parses a FileUrl for $type file URL: $url', + ({ type, typePath, sanitizedBranch, action, url }) => { + const source = `https://huggingface.co/${typePath}${repo}/${action}/${sanitizedBranch}${path}` + const resolveUrl = `https://huggingface.co/${typePath}${repo}/resolve/${sanitizedBranch}${path}` + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'file', + origin, + repo, + type, + source, + action, + branch: sanitizedBranch, + path, + resolveUrl, + }) + } + ) +})