diff --git a/.tool-versions b/.tool-versions index 8700ffc4..5066ad7b 100644 --- a/.tool-versions +++ b/.tool-versions @@ -1,5 +1,5 @@ nodejs 20.15.0 java corretto-19.0.1.10.1 -aws-sam-cli 1.135.0 +aws-sam-cli 1.148.0 python 3.12.2 uv 0.9.5 \ No newline at end of file diff --git a/api/dependencies/package-lock.json b/api/dependencies/package-lock.json index 38c2e875..3300cac8 100644 --- a/api/dependencies/package-lock.json +++ b/api/dependencies/package-lock.json @@ -1,12 +1,12 @@ { "name": "dc-api-dependencies", - "version": "2.8.1", + "version": "2.9.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "dc-api-dependencies", - "version": "2.8.1", + "version": "2.9.0", "license": "Apache-2.0", "dependencies": { "@aws-crypto/sha256-browser": "^2.0.1", diff --git a/api/dependencies/package.json b/api/dependencies/package.json index 69344877..df27629f 100644 --- a/api/dependencies/package.json +++ b/api/dependencies/package.json @@ -1,6 +1,6 @@ { "name": "dc-api-dependencies", - "version": "2.8.1", + "version": "2.9.0", "description": "NUL Digital Collections API Dependencies", "repository": "https://github.com/nulib/dc-api-v2", "author": "nulib", diff --git a/api/package-lock.json b/api/package-lock.json index 97df8f37..466ba128 100644 --- a/api/package-lock.json +++ b/api/package-lock.json @@ -1,12 +1,12 @@ { "name": "dc-api-build", - "version": "2.8.1", + "version": "2.9.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "dc-api-build", - "version": "2.8.1", + "version": "2.9.0", "hasInstallScript": true, "license": "Apache-2.0", "dependencies": { diff --git a/api/package.json b/api/package.json index 3437dada..4163321b 100644 --- a/api/package.json +++ b/api/package.json @@ -1,6 +1,6 @@ { "name": "dc-api-build", - "version": "2.8.1", + "version": "2.9.0", "description": "NUL Digital Collections API Build Environment", "repository": "https://github.com/nulib/dc-api-v2", "author": "nulib", diff --git a/api/src/api/response/iiif/manifest.js b/api/src/api/response/iiif/manifest.js index 14e6a0fd..daf0be48 100644 --- a/api/src/api/response/iiif/manifest.js +++ b/api/src/api/response/iiif/manifest.js @@ -1,8 +1,14 @@ const { IIIFBuilder } = require("iiif-builder"); -const { dcApiEndpoint, dcUrl } = require("../../../environment"); +const { + dcApiEndpoint, + dcUrl, + openSearchEndpoint, +} = require("../../../environment"); const { transformError } = require("../error"); +const { getFileSet } = require("../../opensearch"); const { addSupplementingAnnotationToCanvas, + addTranscriptionAnnotationsToCanvas, addThumbnailToCanvas, buildAnnotationBody, buildImageResourceId, @@ -18,7 +24,7 @@ const { } = require("./presentation-api/placeholder-canvas"); const { nulLogo, provider } = require("./presentation-api/provider"); -function transform(response) { +async function transform(response, options = {}) { if (response.statusCode === 200) { const builder = new IIIFBuilder(); @@ -27,6 +33,9 @@ function transform(response) { const manifestId = `${dcApiEndpoint()}/works/${source.id}?as=iiif`; + const transcriptionMap = await fetchFileSetTranscriptions(source, options); + const transcriptionPages = {}; + const normalizedFlatManifestObj = builder.createManifest( manifestId, (manifest) => { @@ -64,6 +73,22 @@ function transform(response) { if (!isAuxiliary && fileSet.webvtt) { addSupplementingAnnotationToCanvas(canvas, canvasId, fileSet); } + + /** Add transcription annotations */ + const transcriptions = transcriptionMap[fileSet.id]; + if ( + source.work_type === "Image" && + fileSet.role === "Access" && + transcriptions?.length + ) { + const pageId = `${canvasId}/annotations/page/0`; + addTranscriptionAnnotationsToCanvas( + canvas, + canvasId, + transcriptions + ); + transcriptionPages[pageId] = transcriptions; + } }); } @@ -275,6 +300,22 @@ function transform(response) { primaryFileSet ); } + + /** Add transcription annotations */ + const transcriptions = transcriptionMap[primaryFileSet.id]; + if ( + source.work_type === "Image" && + primaryFileSet.role === "Access" && + transcriptions?.length + ) { + const pageId = `${canvasId}/annotations/page/0`; + addTranscriptionAnnotationsToCanvas( + canvas, + canvasId, + transcriptions + ); + transcriptionPages[pageId] = transcriptions; + } }); } ); @@ -320,6 +361,19 @@ function transform(response) { } } } + + /** Re-do transcription text in annotation bodies as it's getting stripped somehow */ + const annotationPages = jsonManifest.items[i]?.annotations || []; + annotationPages.forEach((page) => { + const pageTranscriptions = transcriptionPages[page.id]; + if (!pageTranscriptions?.length) return; + page.items?.forEach((annotation, idx) => { + const sourceTranscription = pageTranscriptions[idx]; + if (!sourceTranscription) return; + if (!annotation.body) annotation.body = {}; + annotation.body.value = getTranscriptionContent(sourceTranscription); + }); + }); } jsonManifest.provider = [provider]; @@ -338,4 +392,42 @@ function transform(response) { return transformError(response); } +async function fetchFileSetTranscriptions(source, options) { + if (source.work_type !== "Image") return {}; + if (!openSearchEndpoint()) return {}; + + const candidates = (source.file_sets || []).filter( + (file_set) => file_set.role === "Access" && file_set.id + ); + + const allowPrivate = options.allowPrivate || false; + const allowUnpublished = options.allowUnpublished || false; + + const results = await Promise.all( + candidates.map(async (file_set) => { + const response = await getFileSet(file_set.id, { + allowPrivate, + allowUnpublished, + }); + if (response.statusCode !== 200) return null; + const body = JSON.parse(response.body); + const annotations = + body?._source?.annotations?.filter( + (annotation) => annotation.type === "transcription" + ) || []; + if (annotations.length === 0) return null; + return { id: file_set.id, annotations }; + }) + ); + + return results + .filter(Boolean) + .reduce((acc, { id, annotations }) => ({ ...acc, [id]: annotations }), {}); +} + +function getTranscriptionContent(annotation = {}) { + const value = annotation.content ?? ""; + return typeof value === "string" ? value : ""; +} + module.exports = { transform }; diff --git a/api/src/api/response/iiif/presentation-api/items.js b/api/src/api/response/iiif/presentation-api/items.js index 3472c78e..9cf080ae 100644 --- a/api/src/api/response/iiif/presentation-api/items.js +++ b/api/src/api/response/iiif/presentation-api/items.js @@ -13,6 +13,26 @@ function addSupplementingAnnotationToCanvas(canvas, canvasId, fileSet) { ); } +function addTranscriptionAnnotationsToCanvas(canvas, canvasId, transcriptions) { + const validTranscriptions = (transcriptions || []).filter( + hasTranscriptionContent + ); + if (validTranscriptions.length === 0) return; + + canvas.createAnnotationPage( + (pageId = `${canvasId}/annotations/page/0`), + (annotationPageBuilder) => { + annotationPageBuilder.addLabel("Transcription", "en"); + validTranscriptions.forEach((annotation, index) => { + annotationPageBuilder.createAnnotation( + buildTranscriptionAnnotation({ annotation, canvasId, pageId, index }) + ); + }); + }, + true + ); +} + function addThumbnailToCanvas(canvas, fileSet) { if (fileSet.representative_image_url) { canvas.addThumbnail({ @@ -84,6 +104,47 @@ function buildSupplementingAnnotation({ canvasId, fileSet }) { }; } +function buildTranscriptionAnnotation({ annotation, canvasId, pageId, index }) { + return { + id: `${pageId}/a${index}`, + type: "Annotation", + motivation: "commenting", + body: buildTranscriptionBody(annotation), + target: canvasId, + }; +} + +function buildTranscriptionBody(annotation) { + const value = getTranscriptionContent(annotation); + + const body = { + type: "TextualBody", + value: value, + format: "text/plain", + }; + const languages = normalizeLanguages(annotation.language); + if (languages.length === 1) { + body.language = languages[0]; + } else if (languages.length > 1) { + body.language = languages; + } + return body; +} + +function normalizeLanguages(value) { + if (!value) return []; + if (Array.isArray(value)) return value.filter(Boolean); + return [value]; +} + +function getTranscriptionContent(annotation = {}) { + return typeof annotation.content === "string" ? annotation.content : ""; +} + +function hasTranscriptionContent(annotation) { + return getTranscriptionContent(annotation) !== ""; +} + function isAltFormat(mimeType) { const acceptedTypes = [ "application/pdf", @@ -107,6 +168,7 @@ function isPDF(mimeType) { module.exports = { addSupplementingAnnotationToCanvas, + addTranscriptionAnnotationsToCanvas, addThumbnailToCanvas, annotationType, buildAnnotationBody, @@ -114,6 +176,7 @@ module.exports = { buildImageResourceId, buildImageService, buildSupplementingAnnotation, + buildTranscriptionAnnotation, isAltFormat, isAudioVideo, isImage, diff --git a/api/src/handlers/get-annotation-by-id.js b/api/src/handlers/get-annotation-by-id.js new file mode 100644 index 00000000..692ffa0c --- /dev/null +++ b/api/src/handlers/get-annotation-by-id.js @@ -0,0 +1,71 @@ +const { wrap } = require("./middleware"); +const { search, getFileSet } = require("../api/opensearch"); +const { prefix, appInfo } = require("../environment"); +const { transformError } = require("../api/response/error"); + +/** + * Retrieves a single annotation by id + */ +exports.handler = wrap(async (event) => { + const annotationId = event.pathParameters.id; + + const searchBody = { + size: 1, + _source: ["id"], + query: { + bool: { + should: [ + { term: { "annotations.id.keyword": annotationId } }, + { term: { "annotations.id": annotationId } }, + ], + minimum_should_match: 1, + }, + }, + }; + + const searchResponse = await search( + prefix("dc-v2-file-set"), + JSON.stringify(searchBody) + ); + + if (searchResponse.statusCode !== 200) { + return transformError(searchResponse); + } + + const searchPayload = JSON.parse(searchResponse.body); + const hit = searchPayload?.hits?.hits?.[0]; + if (!hit) return transformError({ statusCode: 404 }); + + const fileSetId = hit?._source?.id || hit?._id; + if (!fileSetId) return transformError({ statusCode: 404 }); + + const allowPrivate = + event.userToken.isSuperUser() || event.userToken.isReadingRoom(); + const allowUnpublished = event.userToken.isSuperUser(); + const fileSetResponse = await getFileSet(fileSetId, { + allowPrivate, + allowUnpublished, + }); + + if (fileSetResponse.statusCode !== 200) { + return transformError(fileSetResponse); + } + + const fileSetPayload = JSON.parse(fileSetResponse.body); + const annotation = fileSetPayload?._source?.annotations?.find( + (item) => item.id === annotationId + ); + + if (!annotation) return transformError({ statusCode: 404 }); + + return { + statusCode: 200, + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + data: annotation, + info: appInfo(), + }), + }; +}); diff --git a/api/src/handlers/get-file-set-annotations.js b/api/src/handlers/get-file-set-annotations.js new file mode 100644 index 00000000..ab6c8f25 --- /dev/null +++ b/api/src/handlers/get-file-set-annotations.js @@ -0,0 +1,33 @@ +const { wrap } = require("./middleware"); +const { getFileSet } = require("../api/opensearch"); +const { appInfo } = require("../environment"); +const opensearchResponse = require("../api/response/opensearch"); + +/** + * Returns annotations for a FileSet + */ +exports.handler = wrap(async (event) => { + const id = event.pathParameters.id; + const allowPrivate = + event.userToken.isSuperUser() || event.userToken.isReadingRoom(); + const allowUnpublished = event.userToken.isSuperUser(); + + const esResponse = await getFileSet(id, { allowPrivate, allowUnpublished }); + if (esResponse.statusCode !== 200) { + return await opensearchResponse.transform(esResponse); + } + + const body = JSON.parse(esResponse.body); + const annotations = body?._source?.annotations ?? null; + + return { + statusCode: 200, + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + data: annotations, + info: appInfo(), + }), + }; +}); diff --git a/api/src/handlers/get-work-by-id.js b/api/src/handlers/get-work-by-id.js index 5cc2ef88..999d3221 100644 --- a/api/src/handlers/get-work-by-id.js +++ b/api/src/handlers/get-work-by-id.js @@ -22,8 +22,11 @@ exports.handler = wrap(async (event) => { if (as && as === "iiif") { // Make it IIIFy - return manifestResponse.transform(esResponse); - } else { - return await opensearchResponse.transform(esResponse); + return await manifestResponse.transform(esResponse, { + allowPrivate, + allowUnpublished, + }); } + + return await opensearchResponse.transform(esResponse); }); diff --git a/api/src/handlers/oai/verbs.js b/api/src/handlers/oai/verbs.js index 67598c8a..b699f9e1 100644 --- a/api/src/handlers/oai/verbs.js +++ b/api/src/handlers/oai/verbs.js @@ -137,7 +137,8 @@ const listIdentifiers = async ( set, resumptionToken ) => { - if (!metadataPrefix) { + // metadataPrefix is only required when NOT using a resumptionToken + if (!resumptionToken && !metadataPrefix) { return invalidOaiRequest( "badArgument", "Missing required metadataPrefix argument" @@ -232,7 +233,8 @@ const listRecords = async ( set, resumptionToken ) => { - if (!metadataPrefix) { + // metadataPrefix is only required when NOT using a resumptionToken + if (!resumptionToken && !metadataPrefix) { return invalidOaiRequest( "badArgument", "Missing required metadataPrefix argument" diff --git a/api/src/package-lock.json b/api/src/package-lock.json index dd3448f6..86fde1fe 100644 --- a/api/src/package-lock.json +++ b/api/src/package-lock.json @@ -1,12 +1,12 @@ { "name": "dc-api", - "version": "2.8.1", + "version": "2.9.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "dc-api", - "version": "2.8.1", + "version": "2.9.0", "license": "Apache-2.0", "dependencies": { "@aws-crypto/sha256-browser": "^2.0.1", diff --git a/api/src/package.json b/api/src/package.json index 2e0d85a3..fecc04b0 100644 --- a/api/src/package.json +++ b/api/src/package.json @@ -1,6 +1,6 @@ { "name": "dc-api", - "version": "2.8.1", + "version": "2.9.0", "description": "NUL Digital Collections API", "repository": "https://github.com/nulib/dc-api-v2", "author": "nulib", diff --git a/api/template.yaml b/api/template.yaml index 9039c3f6..0dfabd49 100644 --- a/api/template.yaml +++ b/api/template.yaml @@ -334,6 +334,54 @@ Resources: ApiId: !Ref dcApi Path: /file-sets/{id} Method: HEAD + getFileSetAnnotationsFunction: + Type: AWS::Serverless::Function + Condition: DeployAPI + Properties: + Handler: handlers/get-file-set-annotations.handler + Description: Gets annotations for a FileSet. + #* Layers: + #* - !Ref apiDependencies + Policies: + - !Ref SecretsPolicy + - !Ref readIndexPolicy + Events: + ApiGet: + Type: HttpApi + Properties: + ApiId: !Ref dcApi + Path: /file-sets/{id}/annotations + Method: GET + ApiHead: + Type: HttpApi + Properties: + ApiId: !Ref dcApi + Path: /file-sets/{id}/annotations + Method: HEAD + getAnnotationByIdFunction: + Type: AWS::Serverless::Function + Condition: DeployAPI + Properties: + Handler: handlers/get-annotation-by-id.handler + Description: Gets an Annotation by id. + #* Layers: + #* - !Ref apiDependencies + Policies: + - !Ref SecretsPolicy + - !Ref readIndexPolicy + Events: + ApiGet: + Type: HttpApi + Properties: + ApiId: !Ref dcApi + Path: /annotations/{id} + Method: GET + ApiHead: + Type: HttpApi + Properties: + ApiId: !Ref dcApi + Path: /annotations/{id} + Method: HEAD getFileSetAuthFunction: Type: AWS::Serverless::Function Condition: DeployAPI @@ -969,4 +1017,4 @@ Resources: - \ No newline at end of file + diff --git a/api/test/fixtures/mocks/annotation-search-empty.json b/api/test/fixtures/mocks/annotation-search-empty.json new file mode 100644 index 00000000..40a0e9d3 --- /dev/null +++ b/api/test/fixtures/mocks/annotation-search-empty.json @@ -0,0 +1,18 @@ +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } +} diff --git a/api/test/fixtures/mocks/annotation-search-hit.json b/api/test/fixtures/mocks/annotation-search-hit.json new file mode 100644 index 00000000..879f2a9b --- /dev/null +++ b/api/test/fixtures/mocks/annotation-search-hit.json @@ -0,0 +1,28 @@ +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "dev-dc-v2-file-set", + "_type": "_doc", + "_id": "1234", + "_score": 1.0, + "_source": { + "id": "1234" + } + } + ] + } +} diff --git a/api/test/fixtures/mocks/fileset-annotated-1234.json b/api/test/fixtures/mocks/fileset-annotated-1234.json new file mode 100644 index 00000000..cb965cf3 --- /dev/null +++ b/api/test/fixtures/mocks/fileset-annotated-1234.json @@ -0,0 +1,23 @@ +{ + "_index": "dev-dc-v2-file-set", + "_type": "_doc", + "_id": "1234", + "_version": 1, + "found": true, + "_source": { + "id": "1234", + "api_model": "FileSet", + "visibility": "Public", + "published": true, + "mime_type": "image/tiff", + "annotations": [ + { + "id": "36a47020-5410-4dda-a7ca-967fe3885bcd", + "type": "transcription", + "language": ["lg", "en"], + "content": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer vitae nisl a leo faucibus consectetur a vel ante. Vivamus imperdiet, arcu a luctus mollis, libero lectus porta ex, quis dapibus quam lectus id urna. Pellentesque nec eros non dolor pharetra rutrum nec sit amet velit. Ut a nisl augue. Pellentesque fermentum odio risus, eget placerat sem vehicula sodales. Quisque pulvinar urna sit amet mi hendrerit faucibus. Phasellus a maximus est. Fusce bibendum pulvinar ipsum, nec blandit nulla feugiat non. Pellentesque est odio, ornare porta pulvinar sit amet, posuere congue nisi. Nam finibus felis metus, id dignissim nisl condimentum in. Proin convallis, leo ac imperdiet luctus, leo velit pulvinar dolor, ut lacinia massa est eu felis. Phasellus porta efficitur ex eu commodo. In fermentum neque sit amet porttitor pharetra. Sed sit amet pellentesque erat, sit amet accumsan risus. Sed varius condimentum nunc, sed luctus metus pretium nec.", + "model": "us.anthropic.claude-sonnet-4-5-20250929-v1:0" + } + ] + } +} diff --git a/api/test/integration/get-annotations.test.js b/api/test/integration/get-annotations.test.js new file mode 100644 index 00000000..b68d35ea --- /dev/null +++ b/api/test/integration/get-annotations.test.js @@ -0,0 +1,55 @@ +"use strict"; + +const chai = require("chai"); +const expect = chai.expect; +chai.use(require("chai-http")); + +describe("Annotation routes", () => { + helpers.saveEnvironment(); + const mock = helpers.mockIndex(); + + describe("GET /file-sets/{id}/annotations", () => { + const { handler } = requireSource("handlers/get-file-set-annotations"); + + it("returns annotations for a file set", async () => { + mock + .get("/dc-v2-file-set/_doc/1234") + .reply(200, helpers.testFixture("mocks/fileset-annotated-1234.json")); + + const event = helpers + .mockEvent("GET", "/file-sets/{id}/annotations") + .pathParams({ id: 1234 }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + + const body = JSON.parse(result.body); + expect(body.data).to.be.an("array").with.lengthOf(1); + expect(body.data[0].type).to.eq("transcription"); + }); + }); + + describe("GET /annotations/{id}", () => { + const { handler } = requireSource("handlers/get-annotation-by-id"); + + it("returns a single annotation", async () => { + mock + .post("/dc-v2-file-set/_search", () => true) + .reply(200, helpers.testFixture("mocks/annotation-search-hit.json")); + + mock + .get("/dc-v2-file-set/_doc/1234") + .reply(200, helpers.testFixture("mocks/fileset-annotated-1234.json")); + + const event = helpers + .mockEvent("GET", "/annotations/{id}") + .pathParams({ id: "36a47020-5410-4dda-a7ca-967fe3885bcd" }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + + const body = JSON.parse(result.body); + expect(body.data.id).to.eq("36a47020-5410-4dda-a7ca-967fe3885bcd"); + }); + }); +}); diff --git a/api/test/integration/get-doc.test.js b/api/test/integration/get-doc.test.js index ff9203a6..8086da7d 100644 --- a/api/test/integration/get-doc.test.js +++ b/api/test/integration/get-doc.test.js @@ -169,6 +169,90 @@ describe("Doc retrieval routes", () => { }); }); + describe("GET /file-sets/{id}/annotations", () => { + const { handler } = requireSource("handlers/get-file-set-annotations"); + + it("returns annotations for a file-set", async () => { + mock + .get("/dc-v2-file-set/_doc/1234") + .reply(200, helpers.testFixture("mocks/fileset-annotated-1234.json")); + + const event = helpers + .mockEvent("GET", "/file-sets/{id}/annotations") + .pathParams({ id: 1234 }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + + const body = JSON.parse(result.body); + expect(body.data).to.be.an("array").with.lengthOf(1); + expect(body.data[0].type).to.eq("transcription"); + }); + + it("returns null when no annotations exist", async () => { + mock + .get("/dc-v2-file-set/_doc/1234") + .reply(200, helpers.testFixture("mocks/fileset-1234.json")); + + const event = helpers + .mockEvent("GET", "/file-sets/{id}/annotations") + .pathParams({ id: 1234 }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + + const body = JSON.parse(result.body); + expect(body.data).to.eq(null); + }); + }); + + describe("GET /annotations/{id}", () => { + const { handler } = requireSource("handlers/get-annotation-by-id"); + + it("returns a single annotation", async () => { + mock + .post("/dc-v2-file-set/_search", (body) => { + const parsed = + typeof body === "string" + ? JSON.parse(body) + : Buffer.isBuffer(body) + ? JSON.parse(body.toString()) + : body; + expect(parsed.query.bool.should.length).to.eq(2); + return true; + }) + .reply(200, helpers.testFixture("mocks/annotation-search-hit.json")); + + mock + .get("/dc-v2-file-set/_doc/1234") + .reply(200, helpers.testFixture("mocks/fileset-annotated-1234.json")); + + const event = helpers + .mockEvent("GET", "/annotations/{id}") + .pathParams({ + id: "36a47020-5410-4dda-a7ca-967fe3885bcd", + }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + const body = JSON.parse(result.body); + expect(body.data.id).to.eq("36a47020-5410-4dda-a7ca-967fe3885bcd"); + }); + + it("404s when annotation is missing", async () => { + mock + .post("/dc-v2-file-set/_search", () => true) + .reply(200, helpers.testFixture("mocks/annotation-search-empty.json")); + + const event = helpers + .mockEvent("GET", "/annotations/{id}") + .pathParams({ id: "missing" }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(404); + }); + }); + describe("Superuser", () => { helpers.saveEnvironment(); let event; diff --git a/api/test/integration/get-work-by-id.test.js b/api/test/integration/get-work-by-id.test.js index b176f3ff..14c58af8 100644 --- a/api/test/integration/get-work-by-id.test.js +++ b/api/test/integration/get-work-by-id.test.js @@ -51,6 +51,24 @@ describe("Retrieve work by id", () => { .get("/dc-v2-work/_doc/1234") .reply(200, helpers.testFixture("mocks/work-1234.json")); + // Minimal transcription fetch for Access file sets in the fixture + mock + .get("/dc-v2-file-set/_doc/076dcbd8-8c57-40e8-bdf7-dc9153c87a36") + .reply(200, { + _source: { + id: "076dcbd8-8c57-40e8-bdf7-dc9153c87a36", + annotations: [], + }, + }); + mock + .get("/dc-v2-file-set/_doc/51862c1c-c024-45dc-ab26-694bd8ebc16c") + .reply(200, { + _source: { + id: "51862c1c-c024-45dc-ab26-694bd8ebc16c", + annotations: [], + }, + }); + const event = helpers .mockEvent("GET", "/works/{id}") .pathParams({ id: 1234 }) diff --git a/api/test/integration/oai.test.js b/api/test/integration/oai.test.js index 3dc65100..1d7a50a8 100644 --- a/api/test/integration/oai.test.js +++ b/api/test/integration/oai.test.js @@ -207,6 +207,25 @@ describe("Oai routes", () => { expect(resumptionToken).to.not.haveOwnProperty("_text"); }); + it("allows resumptionToken without metadataPrefix for ListRecords", async () => { + mock + .post( + "/_search/scroll/FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFm1jN3ZCajdnUURpbUhad1hIYnNsQmcAAAAAAAB2DhZXbmtMZVF5Q1JsMi1ScGRsYUlHLUtB" + ) + .reply(200, helpers.testFixture("mocks/scroll.json")); + + const body = + "verb=ListRecords&resumptionToken=FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFm1jN3ZCajdnUURpbUhad1hIYnNsQmcAAAAAAAB2DhZXbmtMZVF5Q1JsMi1ScGRsYUlHLUtB"; + const event = helpers.mockEvent("POST", "/oai").body(body).render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + expect(result).to.have.header("content-type", /application\/xml/); + const resultBody = convert.xml2js(result.body, xmlOpts); + expect(resultBody["OAI-PMH"].ListRecords.record) + .to.be.an("array") + .and.to.have.lengthOf(12); + }); + it("returns a badResumptionToken error when a resumptionToken expires", async () => { mock .post( @@ -480,6 +499,30 @@ describe("Oai routes", () => { expect(resumptionToken).to.not.haveOwnProperty("_text"); }); + it("allows resumptionToken without metadataPrefix for ListIdentifiers", async () => { + mock + .post( + "/_search/scroll/FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFm1jN3ZCajdnUURpbUhad1hIYnNsQmcAAAAAAAB2DhZXbmtMZVF5Q1JsMi1ScGRsYUlHLUtB" + ) + .reply(200, helpers.testFixture("mocks/scroll.json")); + + const event = helpers + .mockEvent("GET", "/oai") + .queryParams({ + verb: "ListIdentifiers", + resumptionToken: + "FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFm1jN3ZCajdnUURpbUhad1hIYnNsQmcAAAAAAAB2DhZXbmtMZVF5Q1JsMi1ScGRsYUlHLUtB", + }) + .render(); + const result = await handler(event); + expect(result.statusCode).to.eq(200); + expect(result).to.have.header("content-type", /application\/xml/); + const resultBody = convert.xml2js(result.body, xmlOpts); + const resumptionToken = + resultBody["OAI-PMH"].ListIdentifiers.resumptionToken; + expect(resumptionToken["_text"]).to.have.lengthOf(120); + }); + it("returns a badResumptionToken error when a resumptionToken expires", async () => { mock .post( diff --git a/av-download/lambdas/package-lock.json b/av-download/lambdas/package-lock.json index 4da0a216..9138c4fc 100644 --- a/av-download/lambdas/package-lock.json +++ b/av-download/lambdas/package-lock.json @@ -1,12 +1,12 @@ { "name": "lambdas", - "version": "2.8.1", + "version": "2.9.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "lambdas", - "version": "2.8.1", + "version": "2.9.0", "license": "Apache-2.0", "dependencies": { "fluent-ffmpeg": "2.1.3" diff --git a/av-download/lambdas/package.json b/av-download/lambdas/package.json index 8749bcdc..18fc0a6a 100644 --- a/av-download/lambdas/package.json +++ b/av-download/lambdas/package.json @@ -1,6 +1,6 @@ { "name": "lambdas", - "version": "2.8.1", + "version": "2.9.0", "description": "Non-API handler lambdas", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" diff --git a/chat/dependencies/requirements.txt b/chat/dependencies/requirements.txt new file mode 100644 index 00000000..9e1a6cc5 --- /dev/null +++ b/chat/dependencies/requirements.txt @@ -0,0 +1,190 @@ +# This file was autogenerated by uv via the following command: +# uv export --format requirements-txt --no-hashes +annotated-types==0.7.0 + # via pydantic +anyio==4.11.0 + # via + # httpx + # openai +boto3==1.40.47 + # via + # dc-api-v2-chat + # langchain-aws +botocore==1.40.47 + # via + # boto3 + # s3transfer +certifi==2025.10.5 + # via + # httpcore + # httpx + # opensearch-py + # requests +charset-normalizer==3.4.3 + # via requests +colorama==0.4.6 ; sys_platform == 'win32' + # via tqdm +distro==1.9.0 + # via openai +events==0.5 + # via opensearch-py +greenlet==3.2.4 ; (python_full_version < '3.14' and platform_machine == 'AMD64') or (python_full_version < '3.14' and platform_machine == 'WIN32') or (python_full_version < '3.14' and platform_machine == 'aarch64') or (python_full_version < '3.14' and platform_machine == 'amd64') or (python_full_version < '3.14' and platform_machine == 'ppc64le') or (python_full_version < '3.14' and platform_machine == 'win32') or (python_full_version < '3.14' and platform_machine == 'x86_64') + # via sqlalchemy +h11==0.16.0 + # via httpcore +honeybadger==0.23.1 + # via dc-api-v2-chat +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # langgraph-sdk + # langsmith + # openai +idna==3.10 + # via + # anyio + # httpx + # requests +jiter==0.11.0 + # via openai +jmespath==1.0.1 + # via + # boto3 + # botocore +jsonpatch==1.33 + # via langchain-core +jsonpointer==3.0.0 + # via jsonpatch +langchain==0.3.27 + # via dc-api-v2-chat +langchain-aws==0.2.35 + # via dc-api-v2-chat +langchain-core==0.3.78 + # via + # langchain + # langchain-aws + # langchain-openai + # langchain-text-splitters + # langgraph + # langgraph-checkpoint + # langgraph-prebuilt +langchain-openai==0.3.35 + # via dc-api-v2-chat +langchain-text-splitters==0.3.11 + # via langchain +langgraph==0.6.8 + # via dc-api-v2-chat +langgraph-checkpoint==2.1.2 + # via + # langgraph + # langgraph-prebuilt +langgraph-prebuilt==0.6.4 + # via langgraph +langgraph-sdk==0.2.9 + # via langgraph +langsmith==0.4.33 + # via + # langchain + # langchain-core +numpy==2.2.6 + # via + # dc-api-v2-chat + # langchain-aws +openai==1.109.1 + # via + # dc-api-v2-chat + # langchain-openai +opensearch-py==2.8.0 + # via dc-api-v2-chat +orjson==3.11.3 + # via + # langgraph-sdk + # langsmith +ormsgpack==1.10.0 + # via langgraph-checkpoint +packaging==25.0 + # via + # langchain-core + # langsmith +psutil==7.1.0 + # via honeybadger +pydantic==2.12.0 + # via + # langchain + # langchain-aws + # langchain-core + # langgraph + # langsmith + # openai +pydantic-core==2.41.1 + # via pydantic +pyjwt==2.6.0 + # via dc-api-v2-chat +python-dateutil==2.9.0.post0 + # via + # botocore + # opensearch-py +python-dotenv==1.0.1 + # via dc-api-v2-chat +pyyaml==6.0.3 + # via + # langchain + # langchain-core +regex==2025.9.18 + # via tiktoken +requests==2.32.5 + # via + # dc-api-v2-chat + # langchain + # langsmith + # opensearch-py + # requests-aws4auth + # requests-toolbelt + # tiktoken +requests-aws4auth==1.3.1 + # via dc-api-v2-chat +requests-toolbelt==1.0.0 + # via langsmith +s3transfer==0.14.0 + # via boto3 +six==1.17.0 + # via + # honeybadger + # python-dateutil +sniffio==1.3.1 + # via + # anyio + # openai +sqlalchemy==2.0.43 + # via langchain +tenacity==9.1.2 + # via langchain-core +tiktoken==0.11.0 + # via + # dc-api-v2-chat + # langchain-openai +tqdm==4.67.1 + # via openai +typing-extensions==4.15.0 + # via + # anyio + # langchain-core + # openai + # pydantic + # pydantic-core + # sqlalchemy + # typing-inspection +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via + # botocore + # opensearch-py + # requests +wheel==0.45.1 + # via dc-api-v2-chat +xxhash==3.6.0 + # via langgraph +zstandard==0.25.0 + # via langsmith diff --git a/chat/pyproject.toml b/chat/pyproject.toml index 07a0d00b..0c880ec5 100644 --- a/chat/pyproject.toml +++ b/chat/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dc-api-v2-chat" -version = "2.8.1" +version = "2.9.0" requires-python = ">=3.12" dependencies = [ "boto3~=1.34", @@ -29,4 +29,4 @@ dev = [ ] [tool.uv] -default-groups = [] \ No newline at end of file +default-groups = [] diff --git a/chat/uv.lock b/chat/uv.lock index ef907a85..6c24d3d6 100644 --- a/chat/uv.lock +++ b/chat/uv.lock @@ -302,7 +302,7 @@ wheels = [ [[package]] name = "dc-api-v2-chat" -version = "2.8.1" +version = "2.9.0" source = { virtual = "." } dependencies = [ { name = "boto3" }, @@ -388,6 +388,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -397,6 +399,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -404,6 +408,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] diff --git a/docs/docs/spec/data-types.yaml b/docs/docs/spec/data-types.yaml index 93313b06..61d655b6 100644 --- a/docs/docs/spec/data-types.yaml +++ b/docs/docs/spec/data-types.yaml @@ -216,6 +216,11 @@ components: nullable: true additionalProperties: type: string + annotations: + type: array + nullable: true + items: + $ref: "#/components/schemas/Annotation" extracted_metadata: type: object nullable: true @@ -254,6 +259,28 @@ components: - Auxiliary - Preservation - Supplemental + Annotation: + type: object + properties: + id: + type: string + format: uuid + type: + type: string + language: + type: array + items: + type: string + content: + type: string + model: + type: string + required: + - id + - type + - language + - content + - model GenericIdLabel: type: object nullable: true diff --git a/docs/docs/spec/openapi.yaml b/docs/docs/spec/openapi.yaml index e6b8ca09..009adc1d 100644 --- a/docs/docs/spec/openapi.yaml +++ b/docs/docs/spec/openapi.yaml @@ -168,6 +168,26 @@ paths: responses: 200: $ref: "./types.yaml#/components/responses/DocumentResponse" + /file-sets/{id}/annotations: + get: + operationId: getFileSetAnnotations + tags: + - FileSet + parameters: + - $ref: "./types.yaml#/components/parameters/id" + responses: + 200: + $ref: "./types.yaml#/components/responses/AnnotationsResponse" + /annotations/{id}: + get: + operationId: getAnnotationById + tags: + - Annotation + parameters: + - $ref: "./types.yaml#/components/parameters/id" + responses: + 200: + $ref: "./types.yaml#/components/responses/AnnotationResponse" /file-sets/{id}/authorization: get: operationId: getFileSetAuth diff --git a/docs/docs/spec/types.yaml b/docs/docs/spec/types.yaml index 119bdeea..014eff8d 100644 --- a/docs/docs/spec/types.yaml +++ b/docs/docs/spec/types.yaml @@ -115,6 +115,31 @@ components: $ref: "#/components/schemas/IndexDocument" info: type: object + AnnotationsResponse: + description: Annotations for a FileSet + content: + application/json: + schema: + type: object + properties: + data: + type: array + items: + $ref: "./data-types.yaml#/components/schemas/Annotation" + nullable: true + info: + $ref: "./data-types.yaml#/components/schemas/Info" + AnnotationResponse: + description: A single annotation response + content: + application/json: + schema: + type: object + properties: + data: + $ref: "./data-types.yaml#/components/schemas/Annotation" + info: + $ref: "./data-types.yaml#/components/schemas/Info" SearchResponse: description: A page of search results content: diff --git a/docs/pyproject.toml b/docs/pyproject.toml index e4c315ad..335b1af5 100644 --- a/docs/pyproject.toml +++ b/docs/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dc-api-v2-docs" -version = "2.8.1" +version = "2.9.0" requires-python = ">=3.12" dependencies = [ "mkdocs>=1.1.2,<2.0.0", diff --git a/docs/uv.lock b/docs/uv.lock index 2c645943..445d155a 100644 --- a/docs/uv.lock +++ b/docs/uv.lock @@ -123,7 +123,7 @@ wheels = [ [[package]] name = "dc-api-v2-docs" -version = "2.8.1" +version = "2.9.0" source = { virtual = "." } dependencies = [ { name = "diagrams" },