This repository was archived by the owner on Jun 21, 2023. It is now read-only.

Commit 5f2d0ee: Add scraping & dump wrangling to harke
1 parent: d414cfd

13 files changed: +3934 additions, -43 deletions


packages/harke/package.json

Lines changed: 4 additions & 1 deletion
@@ -21,7 +21,10 @@
     "test:dev": "jest --watchAll"
   },
   "dependencies": {
-    "cheerio": "^1.0.0-rc.5"
+    "cheerio": "^1.0.0-rc.5",
+    "lodash": "^4.17.21",
+    "@algorithmwatch/scraper": "^0.1.0",
+    "@algorithmwatch/utils": "^0.1.0"
   },
   "devDependencies": {
     "@types/cheerio": "^0.22.27",

packages/harke/src/dump.ts

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+import { pickArray } from '@algorithmwatch/utils';
+import _ from 'lodash';
+import { scrapeYouTubeVideos } from './scrape';
+
+// Extract videos from watch history
+const extractWatchedVideosFromDump = (videoList) => {
+  return _.map(
+    videoList,
+    _.partialRight(_.pick, ['title', 'titleUrl', 'time']),
+  );
+};
+
+const scrapeVideosFromWatchedDump = (videolist, max) => {
+  const items = extractWatchedVideosFromDump(videolist);
+  const urls = pickArray(items.slice(0, max), ['titleUrl']).map(
+    (x) => x.titleUrl,
+  );
+
+  return scrapeYouTubeVideos(urls, {
+    delay: 0,
+    saveCache: false,
+    verbose: true,
+    storeBrokenHtml: false,
+  });
+};
+
+export { extractWatchedVideosFromDump, scrapeVideosFromWatchedDump };
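
For reference, a minimal usage sketch of the new dump helpers (not part of this commit). It assumes a Google-Takeout-style watch-history JSON whose entries carry the title, titleUrl and time fields picked above; the file path, entry values and import path are illustrative.

import * as fs from 'fs';
import {
  extractWatchedVideosFromDump,
  scrapeVideosFromWatchedDump,
} from './dump';

// Hypothetical watch-history export, e.g. from Google Takeout.
const dump = JSON.parse(String(fs.readFileSync('watch-history.json')));

// Keeps only title/titleUrl/time for each entry.
const watched = extractWatchedVideosFromDump(dump);
console.log(watched.length, 'videos in dump');

// Scrapes the first 5 watched videos via scrapeYouTubeVideos.
scrapeVideosFromWatchedDump(dump, 5).then((result) => console.log(result));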

packages/harke/src/index.ts

Lines changed: 2 additions & 0 deletions
@@ -1,12 +1,14 @@
 import * as constants from './constants';

+export * from './dump';
 export * from './extractors/parsePlaylistPage';
 export * from './extractors/parseSearchHistory';
 export * from './extractors/parseSearchPage';
 export * from './extractors/parseSubscribedChannels';
 export * from './extractors/parseVideoPage';
 export * from './extractors/parseWatchHistory';
 export * from './parse-no-js';
+export * from './scrape';
 export * from './types';
 export * from './utils';
 export { constants };

packages/harke/src/parse-no-js.ts

Lines changed: 51 additions & 30 deletions
@@ -1,35 +1,56 @@
-import * as cheerio from 'cheerio';
-import { extractNumberFromString } from './utils';
+import cheerio from 'cheerio';
+import { convertISO8601ToMs, extractNumberFromString } from './utils';
+
+const makeGetMeta = ($root) => {
+  const getMetaContent = (meta) => {
+    const result =
+      $root(`div#watch7-content meta[itemprop="${meta}"]`)
+        .first()
+        .attr('content') ||
+      $root(`div#watch7-content span[itemprop="${meta}"] link[itemprop="name"]`)
+        .first()
+        .attr('content');
+    return result;
+  };
+  return getMetaContent;
+};

-const parseVideoNoJs = (html: string) => {
+const parseVideoNoJs = (html: string): any => {
   const $root = cheerio.load(html);
-
-  if (
-    $root('div#watch7-content meta[itemprop="unlisted"]')
-      .first()
-      .attr('content') == null
-  )
-    return null;
-
-  const unlisted =
-    $root('div#watch7-content meta[itemprop="unlisted"]')
-      .first()
-      .attr('content') === 'True';
-
-  const viewCountEl = $root(
-    'div#watch7-content meta[itemprop="interactionCount"]',
-  ).first();
-  const viewCount = extractNumberFromString(viewCountEl.attr('content') || '0');
-
-  const categoryEl = $root('div#watch7-content meta[itemprop="genre"]').first();
-  const category = categoryEl.attr('content');
-
-  const publishedAtEl = $root(
-    'div#watch7-content meta[itemprop="datePublished"]',
-  ).first();
-  const publishedAt = publishedAtEl.attr('content');
-
-  return { unlisted, viewCount, category, publishedAt };
+  const getMeta = makeGetMeta($root);
+
+  if (getMeta('unlisted') == null) return null;
+
+  const unlisted = getMeta('unlisted') === 'True';
+  const viewCount = extractNumberFromString(getMeta('interactionCount') || '0');
+  const category = getMeta('genre');
+  const publishedAt = getMeta('datePublished');
+  const duration = convertISO8601ToMs(getMeta('duration'));
+  const author = getMeta('author');
+  const title = getMeta('name');
+
+  const result = {
+    unlisted,
+    viewCount,
+    category,
+    publishedAt,
+    title,
+    duration,
+    author,
+  };
+
+  const reKeywords = /<meta name="keywords" content="([^"]*)">/;
+  const matchKeywords = html.match(reKeywords);
+  if (matchKeywords && matchKeywords.length >= 2)
+    result['keywords'] = matchKeywords[1];
+
+  // Alternative way to get duration with regex
+  // const reDuration = /"approxDurationMs":"(\d+)"/;
+  // const matchedDuration = html.match(reDuration);
+  // if (matchedDuration && matchedDuration.length >= 2)
+  //   result['duration'] = parseInt(matchedDuration[1]);
+
+  return result;
 };

 export { parseVideoNoJs };
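
For reference, a rough usage sketch of the reworked no-JS parser (not part of this commit). The HTML fragment is a hypothetical, heavily stripped-down watch page containing only the metadata selectors used above; real pages carry far more markup, and the exact values of viewCount and duration depend on the helpers in ./utils, which are not shown in this diff.

import { parseVideoNoJs } from './parse-no-js';

// Hypothetical minimal markup matching the itemprop selectors above.
const html = `
<div id="watch7-content">
  <meta itemprop="unlisted" content="False">
  <meta itemprop="interactionCount" content="12345">
  <meta itemprop="genre" content="Education">
  <meta itemprop="datePublished" content="2021-05-01">
  <meta itemprop="duration" content="PT1M30S">
  <meta itemprop="name" content="Some video title">
  <span itemprop="author">
    <link itemprop="name" content="Some channel">
  </span>
</div>`;

const video = parseVideoNoJs(html);
// video.unlisted === false, video.title === 'Some video title',
// video.author === 'Some channel'; viewCount/duration come from the ./utils helpers.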

packages/harke/src/parse.ts

Lines changed: 1 addition & 3 deletions
@@ -1,6 +1,4 @@
-// Importing cheerio as below causes problems in DataSkop's JSON validation.
-// import cheerio from 'cheerio';
-import * as cheerio from 'cheerio';
+import cheerio from 'cheerio';
 import { ParserFieldParams, ParserResult, ParserResultSlug } from './types';
 import { extractJsonLinkedData } from './utils';

packages/harke/src/scrape.ts

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import { scrapeItems } from '@algorithmwatch/scraper';
+import { parseVideoNoJs } from './parse-no-js';
+
+const parseFun = (html, storeBrokenHtml, logFun) => {
+  try {
+    return parseVideoNoJs(html);
+  } catch (error) {
+    logFun(`Failed with: ${error}`);
+
+    throw new Error('Parsing error');
+  }
+};
+
+const scrapeYouTubeVideos = async (
+  videoUrls: string[],
+  options: any,
+  logFun = console.log,
+): Promise<any> => {
+  return scrapeItems(
+    videoUrls,
+    {},
+    options,
+    {},
+    logFun,
+    undefined,
+    undefined,
+    parseFun,
+    undefined,
+  );
+};
+export { scrapeYouTubeVideos };
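
And a short sketch of calling the scraper wrapper directly (not part of this commit). The option names are copied from dump.ts above; @algorithmwatch/scraper's full option set is not visible in this diff, so treat them as assumptions.

import { scrapeYouTubeVideos } from './scrape';

const urls = [
  'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
  'https://www.youtube.com/watch?v=jNQXAC9IVRw',
];

// Same options object that scrapeVideosFromWatchedDump passes along.
scrapeYouTubeVideos(urls, {
  delay: 0,
  saveCache: false,
  verbose: true,
  storeBrokenHtml: false,
}).then((items) => console.log(items));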

packages/harke/test/dump.test.ts

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import * as fs from 'fs';
+import {
+  extractWatchedVideosFromDump,
+  scrapeVideosFromWatchedDump,
+} from '../src';
+
+describe('dump', () => {
+  let parsedResult: any;
+  let scrapedResult: any;
+
+  beforeAll(async () => {
+    const filePath = 'test/dumps/2023_02_06_jf_watch-history.json';
+    const file = JSON.parse(String(fs.readFileSync(filePath)));
+    parsedResult = extractWatchedVideosFromDump(file);
+    scrapedResult = await scrapeVideosFromWatchedDump(file, 2);
+    // console.warn('test', parsedResult);
+    // console.warn('test', JSON.stringify(scrapedResult));
+  });
+
+  test('dump videos', () => {
+    expect(parsedResult.length).toBeGreaterThan(5);
+  });
+});
