This repository was archived by the owner on Jun 21, 2023. It is now read-only.

Commit 5f2d0ee: Add scraping & dump wrangling to harke
1 parent: d414cfd

13 files changed: +3934 additions, -43 deletions


packages/harke/package.json

Lines changed: 4 additions & 1 deletion
@@ -21,7 +21,10 @@
     "test:dev": "jest --watchAll"
   },
   "dependencies": {
-    "cheerio": "^1.0.0-rc.5"
+    "cheerio": "^1.0.0-rc.5",
+    "lodash": "^4.17.21",
+    "@algorithmwatch/scraper": "^0.1.0",
+    "@algorithmwatch/utils": "^0.1.0"
   },
   "devDependencies": {
     "@types/cheerio": "^0.22.27",

packages/harke/src/dump.ts

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+import { pickArray } from '@algorithmwatch/utils';
+import _ from 'lodash';
+import { scrapeYouTubeVideos } from './scrape';
+
+// Extract videos from watch history
+const extractWatchedVideosFromDump = (videoList) => {
+  return _.map(
+    videoList,
+    _.partialRight(_.pick, ['title', 'titleUrl', 'time']),
+  );
+};
+
+const scrapeVideosFromWatchedDump = (videolist, max) => {
+  const items = extractWatchedVideosFromDump(videolist);
+  const urls = pickArray(items.slice(0, max), ['titleUrl']).map(
+    (x) => x.titleUrl,
+  );
+
+  return scrapeYouTubeVideos(urls, {
+    delay: 0,
+    saveCache: false,
+    verbose: true,
+    storeBrokenHtml: false,
+  });
+};
+
+export { extractWatchedVideosFromDump, scrapeVideosFromWatchedDump };
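
For reference, a minimal usage sketch of the new dump helpers (not part of this commit). It assumes a Google-Takeout-style watch-history JSON whose entries carry the title, titleUrl and time fields picked above; the file path, entry values and import path are illustrative.

import * as fs from 'fs';
import {
  extractWatchedVideosFromDump,
  scrapeVideosFromWatchedDump,
} from './dump';

// Hypothetical watch-history export, e.g. from Google Takeout.
const dump = JSON.parse(String(fs.readFileSync('watch-history.json')));

// Keeps only title/titleUrl/time for each entry.
const watched = extractWatchedVideosFromDump(dump);
console.log(watched.length, 'videos in dump');

// Scrapes the first 5 watched videos via scrapeYouTubeVideos.
scrapeVideosFromWatchedDump(dump, 5).then((result) => console.log(result));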

packages/harke/src/index.ts

Lines changed: 2 additions & 0 deletions
@@ -1,12 +1,14 @@
 import * as constants from './constants';

+export * from './dump';
 export * from './extractors/parsePlaylistPage';
 export * from './extractors/parseSearchHistory';
 export * from './extractors/parseSearchPage';
 export * from './extractors/parseSubscribedChannels';
 export * from './extractors/parseVideoPage';
 export * from './extractors/parseWatchHistory';
 export * from './parse-no-js';
+export * from './scrape';
 export * from './types';
 export * from './utils';
 export { constants };

packages/harke/src/parse-no-js.ts

Lines changed: 51 additions & 30 deletions
@@ -1,35 +1,56 @@
-import * as cheerio from 'cheerio';
-import { extractNumberFromString } from './utils';
+import cheerio from 'cheerio';
+import { convertISO8601ToMs, extractNumberFromString } from './utils';
+
+const makeGetMeta = ($root) => {
+  const getMetaContent = (meta) => {
+    const result =
+      $root(`div#watch7-content meta[itemprop="${meta}"]`)
+        .first()
+        .attr('content') ||
+      $root(`div#watch7-content span[itemprop="${meta}"] link[itemprop="name"]`)
+        .first()
+        .attr('content');
+    return result;
+  };
+  return getMetaContent;
+};

-const parseVideoNoJs = (html: string) => {
+const parseVideoNoJs = (html: string): any => {
   const $root = cheerio.load(html);
-
-  if (
-    $root('div#watch7-content meta[itemprop="unlisted"]')
-      .first()
-      .attr('content') == null
-  )
-    return null;
-
-  const unlisted =
-    $root('div#watch7-content meta[itemprop="unlisted"]')
-      .first()
-      .attr('content') === 'True';
-
-  const viewCountEl = $root(
-    'div#watch7-content meta[itemprop="interactionCount"]',
-  ).first();
-  const viewCount = extractNumberFromString(viewCountEl.attr('content') || '0');
-
-  const categoryEl = $root('div#watch7-content meta[itemprop="genre"]').first();
-  const category = categoryEl.attr('content');
-
-  const publishedAtEl = $root(
-    'div#watch7-content meta[itemprop="datePublished"]',
-  ).first();
-  const publishedAt = publishedAtEl.attr('content');
-
-  return { unlisted, viewCount, category, publishedAt };
+  const getMeta = makeGetMeta($root);
+
+  if (getMeta('unlisted') == null) return null;
+
+  const unlisted = getMeta('unlisted') === 'True';
+  const viewCount = extractNumberFromString(getMeta('interactionCount') || '0');
+  const category = getMeta('genre');
+  const publishedAt = getMeta('datePublished');
+  const duration = convertISO8601ToMs(getMeta('duration'));
+  const author = getMeta('author');
+  const title = getMeta('name');
+
+  const result = {
+    unlisted,
+    viewCount,
+    category,
+    publishedAt,
+    title,
+    duration,
+    author,
+  };
+
+  const reKeywords = /<meta name="keywords" content="([^"]*)">/;
+  const matchKeywords = html.match(reKeywords);
+  if (matchKeywords && matchKeywords.length >= 2)
+    result['keywords'] = matchKeywords[1];
+
+  // Alternative way to get duration with regex
+  // const reDuration = /"approxDurationMs":"(\d+)"/;
+  // const matchedDuration = html.match(reDuration);
+  // if (matchedDuration && matchedDuration.length >= 2)
+  //   result['duration'] = parseInt(matchedDuration[1]);
+
+  return result;
 };

 export { parseVideoNoJs };
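
For reference, a rough usage sketch of the reworked no-JS parser (not part of this commit). The HTML fragment is a hypothetical, heavily stripped-down watch page containing only the metadata selectors used above; real pages carry far more markup, and the exact values of viewCount and duration depend on the helpers in ./utils, which are not shown in this diff.

import { parseVideoNoJs } from './parse-no-js';

// Hypothetical minimal markup matching the itemprop selectors above.
const html = `
<div id="watch7-content">
  <meta itemprop="unlisted" content="False">
  <meta itemprop="interactionCount" content="12345">
  <meta itemprop="genre" content="Education">
  <meta itemprop="datePublished" content="2021-05-01">
  <meta itemprop="duration" content="PT1M30S">
  <meta itemprop="name" content="Some video title">
  <span itemprop="author">
    <link itemprop="name" content="Some channel">
  </span>
</div>`;

const video = parseVideoNoJs(html);
// video.unlisted === false, video.title === 'Some video title',
// video.author === 'Some channel'; viewCount/duration come from the ./utils helpers.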

packages/harke/src/parse.ts

Lines changed: 1 addition & 3 deletions
@@ -1,6 +1,4 @@
-// Importing cheerio as below causes problems in DataSkop's JSON validation.
-// import cheerio from 'cheerio';
-import * as cheerio from 'cheerio';
+import cheerio from 'cheerio';
 import { ParserFieldParams, ParserResult, ParserResultSlug } from './types';
 import { extractJsonLinkedData } from './utils';

packages/harke/src/scrape.ts

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import { scrapeItems } from '@algorithmwatch/scraper';
+import { parseVideoNoJs } from './parse-no-js';
+
+const parseFun = (html, storeBrokenHtml, logFun) => {
+  try {
+    return parseVideoNoJs(html);
+  } catch (error) {
+    logFun(`Failed with: ${error}`);
+
+    throw new Error('Parsing error');
+  }
+};
+
+const scrapeYouTubeVideos = async (
+  videoUrls: string[],
+  options: any,
+  logFun = console.log,
+): Promise<any> => {
+  return scrapeItems(
+    videoUrls,
+    {},
+    options,
+    {},
+    logFun,
+    undefined,
+    undefined,
+    parseFun,
+    undefined,
+  );
+};
+export { scrapeYouTubeVideos };
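
And a short sketch of calling the scraper wrapper directly (not part of this commit). The option names are copied from dump.ts above; @algorithmwatch/scraper's full option set is not visible in this diff, so treat them as assumptions.

import { scrapeYouTubeVideos } from './scrape';

const urls = [
  'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
  'https://www.youtube.com/watch?v=jNQXAC9IVRw',
];

// Same options object that scrapeVideosFromWatchedDump passes along.
scrapeYouTubeVideos(urls, {
  delay: 0,
  saveCache: false,
  verbose: true,
  storeBrokenHtml: false,
}).then((items) => console.log(items));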

packages/harke/test/dump.test.ts

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import * as fs from 'fs';
+import {
+  extractWatchedVideosFromDump,
+  scrapeVideosFromWatchedDump,
+} from '../src';
+
+describe('dump', () => {
+  let parsedResult: any;
+  let scrapedResult: any;
+
+  beforeAll(async () => {
+    const filePath = 'test/dumps/2023_02_06_jf_watch-history.json';
+    const file = JSON.parse(String(fs.readFileSync(filePath)));
+    parsedResult = extractWatchedVideosFromDump(file);
+    scrapedResult = await scrapeVideosFromWatchedDump(file, 2);
+    // console.warn('test', parsedResult);
+    // console.warn('test', JSON.stringify(scrapedResult));
+  });
+
+  test('dump videos', () => {
+    expect(parsedResult.length).toBeGreaterThan(5);
+  });
+});
