|
1 | | -import * as cheerio from 'cheerio'; |
2 | | -import { extractNumberFromString } from './utils'; |
| 1 | +import cheerio from 'cheerio'; |
| 2 | +import { convertISO8601ToMs, extractNumberFromString } from './utils'; |
| 3 | + |
| 4 | +const makeGetMeta = ($root) => { |
| 5 | + const getMetaContent = (meta) => { |
| 6 | + const result = |
| 7 | + $root(`div#watch7-content meta[itemprop="${meta}"]`) |
| 8 | + .first() |
| 9 | + .attr('content') || |
| 10 | + $root(`div#watch7-content span[itemprop="${meta}"] link[itemprop="name"]`) |
| 11 | + .first() |
| 12 | + .attr('content'); |
| 13 | + return result; |
| 14 | + }; |
| 15 | + return getMetaContent; |
| 16 | +}; |
3 | 17 |
|
4 | | -const parseVideoNoJs = (html: string) => { |
| 18 | +const parseVideoNoJs = (html: string): any => { |
5 | 19 | const $root = cheerio.load(html); |
6 | | - |
7 | | - if ( |
8 | | - $root('div#watch7-content meta[itemprop="unlisted"]') |
9 | | - .first() |
10 | | - .attr('content') == null |
11 | | - ) |
12 | | - return null; |
13 | | - |
14 | | - const unlisted = |
15 | | - $root('div#watch7-content meta[itemprop="unlisted"]') |
16 | | - .first() |
17 | | - .attr('content') === 'True'; |
18 | | - |
19 | | - const viewCountEl = $root( |
20 | | - 'div#watch7-content meta[itemprop="interactionCount"]', |
21 | | - ).first(); |
22 | | - const viewCount = extractNumberFromString(viewCountEl.attr('content') || '0'); |
23 | | - |
24 | | - const categoryEl = $root('div#watch7-content meta[itemprop="genre"]').first(); |
25 | | - const category = categoryEl.attr('content'); |
26 | | - |
27 | | - const publishedAtEl = $root( |
28 | | - 'div#watch7-content meta[itemprop="datePublished"]', |
29 | | - ).first(); |
30 | | - const publishedAt = publishedAtEl.attr('content'); |
31 | | - |
32 | | - return { unlisted, viewCount, category, publishedAt }; |
| 20 | + const getMeta = makeGetMeta($root); |
| 21 | + |
| 22 | + if (getMeta('unlisted') == null) return null; |
| 23 | + |
| 24 | + const unlisted = getMeta('unlisted') === 'True'; |
| 25 | + const viewCount = extractNumberFromString(getMeta('interactionCount') || '0'); |
| 26 | + const category = getMeta('genre'); |
| 27 | + const publishedAt = getMeta('datePublished'); |
| 28 | + const duration = convertISO8601ToMs(getMeta('duration')); |
| 29 | + const author = getMeta('author'); |
| 30 | + const title = getMeta('name'); |
| 31 | + |
| 32 | + const result = { |
| 33 | + unlisted, |
| 34 | + viewCount, |
| 35 | + category, |
| 36 | + publishedAt, |
| 37 | + title, |
| 38 | + duration, |
| 39 | + author, |
| 40 | + }; |
| 41 | + |
| 42 | + const reKeywords = /<meta name="keywords" content="([^"]*)">/; |
| 43 | + const matchKeywords = html.match(reKeywords); |
| 44 | + if (matchKeywords && matchKeywords.length >= 2) |
| 45 | + result['keywords'] = matchKeywords[1]; |
| 46 | + |
| 47 | + // Alternative way to get duration with regex |
| 48 | + // const reDuration = /"approxDurationMs":"(\d+)"/; |
| 49 | + // const matchedDuration = html.match(reDuration); |
| 50 | + // if (matchedDuration && matchedDuration.length >= 2) |
| 51 | + // result['duration'] = parseInt(matchedDuration[1]); |
| 52 | + |
| 53 | + return result; |
33 | 54 | }; |
34 | 55 |
|
35 | 56 | export { parseVideoNoJs }; |
0 commit comments