Skip to content

Commit b4dce9d

Browse files
authored
Fix: updating CSS selectors for Perplexity extraction (#286)
* fix: updating CSS content selector
* fix: targeting new source list
* fix: extracting sources list and extracting URLs from each source's <a> tag when available (as a workaround)
* attempt: capture target URL
* chore: cleaning code
* fix: switch to Search tab to extract user answer
* 3.8.8
1 parent 2cc361f commit b4dce9d

File tree

8 files changed

+72
-61
lines changed

8 files changed

+72
-61
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "save-my-chatbot",
3-
"version": "3.8.7",
3+
"version": "3.8.8",
44
"license": "RMD-C 1.1",
55
"author": "Hugo COLLIN",
66
"homepage": "https://save.hugocollin.com",

public/files/updateNotes.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
# Update notes
2+
# 3.8.8
3+
🧩 Fixing Perplexity export
4+
Perplexity changed its interface again, but the export is now working again!
5+
26
# 3.8.7
37
🚀 Perplexity sources extraction working again!
48
Perplexity changed its interface again, but the extension is now back on track!

src/core/services/format/formatMarkdown.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,11 @@ export function formatMarkdown(html: string | Node): any | string | string {
5959
* @returns {string} formatted link
6060
*/
6161
export function formatLink(url: string | HTMLElement, message: string): string {
// Builds a Markdown link of the form "[message](url)".
// - Every backtick is removed from `message`.
// - Every ")" in `url` is replaced by "%29" (presumably so it cannot terminate the link target early).
// - `url?.replace` yields undefined when `url` is null/undefined, so the result then contains "(undefined)".
62+
// console.log(message)
6263
// `url` may be typed as HTMLElement, which has no `replace`; the directive below silences that type error.
// @ts-ignore
63-
return "[" + message.replaceAll("`", "") + "](" + url?.replace(/\)/g, "%29") + ")";
64+
const res = "[" + message.replaceAll("`", "") + "](" + url?.replace(/\)/g, "%29") + ")";
65+
// NOTE(review): this logs every formatted link to the console; looks like leftover debugging output — confirm before release.
console.log(res)
66+
return res;
6467
}
6568

6669
/**

src/core/services/pageExtractor/extractSection.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import {capitalizeFirst, formatLineBreaks} from "../format/formatText";
2-
import {safeExecute} from "../../utils/jsShorteners";
2+
import {safeExecute, sleep} from "../../utils/jsShorteners";
33
import {extractSources} from "./extractSources";
44

55
/**
@@ -31,6 +31,10 @@ export async function extractSection(content: { hasChildNodes: () => any; }, for
3131
async function extractSearchSection(content: HTMLElement, format: (arg0: any) => string, metadata: { extractor: any; sourcesExtraction?: any; }) {
3232
const options = metadata.extractor;
3333

34+
// @ts-ignore
35+
content.querySelector('.whitespace-nowrap.absolute')?.click();
36+
await sleep(100);
37+
3438
// Extract and format the user question
3539
const userQuestionSelector = options.userQuestionSelector ?? 'span, textarea';
3640
const userQuestionElement = content.querySelector(userQuestionSelector) ?? "";

src/core/services/pageExtractor/extractSources.ts

Lines changed: 22 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ export async function extractSources(content: HTMLElement, format: any, data: {
1818
res = "";
1919
i = 1;
2020

21-
for (const {open, close, selector, extractionType, paginationSelector, content: msgContent} of data.selectors) {
21+
for (const {open, close, selector, extractionType, paginationSelector, content: msgContent, scope: scopeType } of data.selectors) {
2222
open && await safeExecute(await selectAndClick(open, content));
2323

2424
switch (extractionType) {
2525
case 'list':
26-
res = await safeExecute(await extractFromList(format, content, selector ?? msgContent)) as unknown as string;
26+
res = await safeExecute(await extractFromList(format, content, selector ?? msgContent, scopeType)) as unknown as string;
2727
break;
2828
case 'tile-list':
2929
res = await safeExecute(await extractFromTileList(format, content, selector)) as unknown as string;
@@ -103,24 +103,27 @@ function extractFromLinks(links: any[] | NodeListOf<any>, format: (arg0: any) =>
103103
* @param selectorOrContent
104104
* @returns {Promise<string>}
105105
*/
106-
async function extractFromList(format: any, content: HTMLElement, selectorOrContent: { selector: any; scope: any; }): Promise<string> {
106+
async function extractFromList(format: any, content: HTMLElement, selectorOrContent: { selector: any; scope: any; }, scopeType: string): Promise<string> {
107107
let res = '';
108108
let i = 1;
109109

110110
const selector = typeof selectorOrContent === "object"
111111
? selectorOrContent.selector
112112
: selectorOrContent;
113113

114-
const scope = typeof selectorOrContent === "object"
115-
? selectorOrContent.scope
116-
: "document";
114+
const scope = scopeType === "content"
115+
? "content"
116+
: typeof selectorOrContent === "object"
117+
? selectorOrContent.scope
118+
: "document";
117119

118120
const qs = scope === "document"
119121
? document.querySelectorAll(selector)
120122
: content.querySelectorAll(selector);
121123

122124
// console.log(content, scope, qs)
123125
for (const tile of qs) {
126+
// console.log("THE TILE : ", tile)
124127
res += await formatSources(i, format, tile);
125128
i++;
126129
}
@@ -162,11 +165,13 @@ async function extractFromTileList(format: any, content: HTMLElement, selector:
162165
* @returns {Promise<string>}
163166
*/
164167
export async function formatSources(i: string | number, format: (arg0: any) => string, tile: Element): Promise<string> {
165-
const elt: HTMLElement = tile.querySelector("div.default") as HTMLElement //Perplexity
168+
const elt: HTMLElement = tile.querySelector("a") as HTMLElement //Perplexity
166169
|| tile;
167170

171+
const title: HTMLElement = tile.querySelector('.font-display') as HTMLElement
172+
168173
const text = "(" + i + ") "
169-
+ format(elt.innerText
174+
+ format(title.innerText
170175
.replaceAll("\n", " ")
171176
.replaceAll('"', '')
172177
.replace(/^\d+/, "") // Removes numbers at the beginning
@@ -206,53 +211,16 @@ export async function formatSources(i: string | number, format: (arg0: any) => s
206211

207212
// Export content
208213
let res = "- ";
214+
// console.log(tile)
209215
// @ts-ignore TODO
210-
if (tile && tile.href)
216+
// if (tile && tile.href)
211217
// @ts-ignore TODO
212-
res += formatLink(tile.href, text) + "\n";
213-
else {
214-
const url: HTMLElement = await safeExecute(extractYoutubeLink(tile as HTMLElement)) as unknown as HTMLElement;
215-
res += url
216-
? formatLink(url, text) + "\n"
217-
: text + "\n";
218-
}
218+
res += (elt.href ? formatLink(elt.href, text) : text) + "\n";
219+
// else {
220+
// const url: HTMLElement = await safeExecute(extractYoutubeLink(tile as HTMLElement)) as unknown as HTMLElement;
221+
// res += url
222+
// ? formatLink(url, text) + "\n"
223+
// : text + "\n";
224+
// }
219225
return res;
220226
}
221-
222-
223-
// async function extractSourcesOld(msgContent, searchResults, res, format) {
224-
// const buttonsInCard = msgContent[2].querySelectorAll("button");
225-
// for (const btn of buttonsInCard) {
226-
// if (btn.textContent.toLowerCase() === "view all search results") {
227-
// // Open modal
228-
// btn.click();
229-
// await sleep(0); // Needed to wait for the modal to open (even if it's 0!)
230-
//
231-
// // Export sources and all search results, put correct index in front of each link
232-
// let i = 1;
233-
// let allResults = "**All search results:**";
234-
//
235-
// const dialogLinks = Array.from(document.querySelectorAll("[role='dialog'] a"));
236-
// const p2Array = Array.from(searchResults);
237-
// dialogLinks.forEach((link) => {
238-
// // If the link is in the sources, add it to the sources with the correct index
239-
// if (p2Array.find((elt) => elt.getAttribute("href") === link.getAttribute("href"))) {
240-
// res += "\n- " + format(link.outerHTML).replace("[", `[(${i}) `);
241-
// }
242-
//
243-
// // Add the link to the all search results with the correct index
244-
// allResults += "\n- " + format(link.outerHTML).replace("[", `[(${i}) `);
245-
// i++;
246-
// });
247-
//
248-
// // Append all search results after the sources
249-
// res += "\n\n" + allResults;
250-
//
251-
// // Close modal
252-
// document.querySelectorAll("[role='dialog'] [type='button']").forEach((btn) => {
253-
// if (btn.textContent.toLowerCase() === "close") btn.click();
254-
// });
255-
// }
256-
// }
257-
// return res;
258-
// }

src/core/utils/captureTargetUrl.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/**
 * Experimental attempt to find out which URL a click on an element would open.
 *
 * Temporarily replaces `window.open` with a stub that records the requested URL instead of
 * opening it, neutralises the click listeners reported for the element, then clicks the element.
 *
 * NOTE(review): the function never returns the captured URL and always ends by throwing
 * `Error("stop")`, so as written it can only be used for debugging via the console logs.
 *
 * @param selecteurElement element (anything exposing `click()`) whose click target is probed
 * @throws {Error} always, with the message "stop"
 */
export function captureTargetUrl(selecteurElement: { click: () => void; }) {
2+
// Holds the captured URL
3+
let urlCapturee = null;
4+
5+
// Replace window.open with an intercepting stub, keeping a reference to the original
6+
const openOriginal = window.open;
7+
window.open = function (url, target) {
8+
// Intercept and store the URL
9+
urlCapturee = url;
10+
11+
// Do not actually open the new tab; only log the intercepted URL
12+
console.log('URL interceptée :', urlCapturee);
13+
14+
// Restore the original method immediately
// NOTE(review): restoration only happens if the stub is called; if the click never
// triggers window.open, the stub stays installed.
15+
window.open = openOriginal;
16+
17+
return null;
18+
};
19+
console.log(window.open)
20+
21+
// NOTE(review): getEventListeners is not declared in this file (hence the directive below);
// it looks like the browser DevTools console utility, which is presumably not available
// to regular scripts — confirm.
// @ts-ignore
22+
const eventHandlers = getEventListeners(selecteurElement)
23+
eventHandlers.click.forEach((evt: { listener: { (this: Window, ev: MouseEvent): any; (): null; }; }) => {
24+
console.log(evt)
25+
// NOTE(review): this calls the global removeEventListener, not selecteurElement's own —
// verify that this actually detaches the element's listener.
removeEventListener('click', evt.listener)
26+
// Overwrite the listener reference in the returned descriptor with a no-op
evt.listener = () => null
27+
})
28+
console.log(eventHandlers)
29+
// Trigger the click so that any window.open call is routed to the stub above
selecteurElement.click();
30+
// Unconditionally aborts the caller after the click
throw new Error("stop")
31+
}

src/features/scraper/domains/Perplexity.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"selector": "h1"
44
},
55
"domainName": "Perplexity.ai",
6-
"contentSelector": "main .mx-auto > div > div > div > div > div .col-span-8",
6+
"contentSelector": "main .scrollable-container > div > div > div > div > div > div:not([class])",
77
"turndown": {
88
"init": {
99
"blankReplacement": "getBlankReplacement"
@@ -40,8 +40,9 @@
4040
"wait": 50
4141
}],
4242
"close": [{"selector": "[data-testid=\"close-modal\"], div.w-full.relative button[aria-label].h-8:not([aria-label=\"Submit\"])", "scope": "document", "wait": 50}],
43-
"selector": ".fixed > div > [class] > div > div > div > div > div > .group, div.w-full.relative > .group",
44-
"extractionType": "list"
43+
"selector": ".relative > div > .flex-col.flex > div.items-start",
44+
"extractionType": "list",
45+
"scope": "content"
4546
},
4647
{
4748
"open": [{"selector": "div.grid > div.flex:nth-last-of-type(1)", "scope": "content", "wait": 50}],

src/features/scraper/rules/rules.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ export function filter_PreserveLineBreaksInPre_Claude(node: { nodeName: string;
9090
}
9191

9292
export function replacement_PreserveLineBreaksInPre_Perplexity(content: any, node: { querySelector: (arg0: string) => any; }) {
93-
console.log("replacement_PreserveLineBreaksInPre_Perplexity", node)
93+
// console.log("replacement_PreserveLineBreaksInPre_Perplexity", node)
9494
const codeBlock = node.querySelector('code');
9595
const codeContent = codeBlock.textContent.trim();
9696
const codeLang = codeBlock.parentNode.parentNode.parentNode.querySelector("div").textContent.trim();

0 commit comments

Comments
 (0)