// 114 lines · 4.1 KiB · TypeScript
import { rankAndDedupe, type RankedPost } from "@/lib/ranking";
|
|
import { withSharedContext } from "./browser";
|
|
import { humanLandingPause, humanScrollPage } from "./human-behavior";
|
|
import type { BrowserProgressCallback } from "./progress";
|
|
import { extractFromPageScripts, extractPostsFromJson } from "./extract";
|
|
import type { ThreadsPost } from "./types";
|
|
import { assertThreadsPageSafe } from "./safety";
|
|
|
|
/**
 * Subscribes to the page's "response" event and hands every relevant JSON
 * payload to `extractPostsFromJson` together with the `collected` array.
 *
 * A response is considered relevant when its URL contains "graphql", "threads"
 * or "instagram" and its content-type header mentions "json". Failures while
 * reading headers or parsing the body are ignored on purpose.
 *
 * @param page - Page whose network responses are observed.
 * @param collected - Array passed to `extractPostsFromJson` for each payload
 *   (presumably the extractor appends the posts it finds — confirm in ./extract).
 */
function attachCollector(page: import("playwright").Page, collected: ThreadsPost[]) {
  const urlKeywords = ["graphql", "threads", "instagram"];

  page.on("response", async (response) => {
    const requestUrl = response.url();
    const isCandidate = urlKeywords.some((keyword) => requestUrl.includes(keyword));
    if (!isCandidate) {
      return;
    }

    try {
      const headers = response.headers();
      const declaredType = headers["content-type"] ?? "";
      if (declaredType.includes("json")) {
        const payload = await response.json();
        extractPostsFromJson(payload, collected);
      }
    } catch {
      // Best effort: an unreadable or non-JSON body is simply skipped.
    }
  });
}
|
|
|
|
/**
 * DOM fallback: builds posts from the post permalinks rendered on the page.
 *
 * At most the first 35 `a[href*="/post/"]` anchors are inspected. For each
 * distinct href the function resolves an absolute permalink, reads the author
 * from the `@name/post` segment (falling back to `username`), and takes the
 * inner text of the first `dir="auto"` div/span inside the enclosing container.
 * Anchors without an href, with an href already seen, or whose trimmed text is
 * shorter than three characters are skipped.
 *
 * @param page - Page to read anchors from.
 * @param username - Author name used when the href has no `@name` segment.
 * @returns Posts in the order their anchors were visited; possibly empty.
 */
async function scrapeProfileDom(page: import("playwright").Page, username: string): Promise<ThreadsPost[]> {
  const maxLinks = 35;
  const stepTimeoutMs = 1200;
  const minTextLength = 3;

  const posts: ThreadsPost[] = [];
  const seen = new Set<string>();
  const links = page.locator('a[href*="/post/"]');
  const count = await links.count();
  const linksToVisit = Math.min(count, maxLinks);

  for (let i = 0; i < linksToVisit; i++) {
    const link = links.nth(i);
    try {
      const href = await link.getAttribute("href", { timeout: stepTimeoutMs });
      if (!href || seen.has(href)) continue;
      seen.add(href);

      const permalink = href.startsWith("http") ? href : `https://www.threads.com${href}`;
      const authorName = href.match(/@([^/]+)\/post/)?.[1] ?? username;

      // Prefer the ancestor flagged as a pressable container; when there is none,
      // fall back to one of the (up to six) ancestor divs matched by the XPath.
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]");
      const scope = (await container.count()) > 0 ? container : link.locator("xpath=ancestor::div[position()<=6]").first();

      // A text node that cannot be read in time counts as "no text".
      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: stepTimeoutMs })
        .catch(() => "");

      // Trim BEFORE validating the length so whitespace-only or padded strings
      // cannot slip through and be stored as empty / too-short posts.
      const text = rawText.trim();
      if (text.length < minTextLength) continue;

      posts.push({
        text,
        permalink,
        authorName,
        externalId: href.match(/\/post\/([^/?]+)/)?.[1],
      });
    } catch {
      // One unreadable anchor must not abort the whole fallback; move on.
    }
  }

  return posts;
}
|
|
|
|
/**
 * Scrapes an account's recent posts using an already-existing page.
 *
 * Sources are consulted in order: JSON captured from network responses, then
 * data embedded in the page's scripts, then the rendered DOM. Each later source
 * is only used while fewer than two posts have been gathered.
 *
 * Note: the "response" listener registered through `attachCollector` is not
 * removed by this function, so it stays on `page` after the call returns.
 *
 * @param page - Existing page that will be navigated to the profile URL.
 * @param username - Profile handle, with or without a leading "@"; surrounding
 *   whitespace is ignored.
 * @param limit - Forwarded to `rankAndDedupe` as its second argument (default 15).
 * @param onStep - Optional progress callback, awaited at each stage.
 * @returns The result of `rankAndDedupe`, or an empty array when the page body
 *   contains one of the not-found phrases.
 * @remarks Errors from `page.goto` and the awaited helpers are not caught here.
 */
export async function getProfilePostsOnPage(
  page: import("playwright").Page,
  username: string,
  limit = 15,
  onStep?: BrowserProgressCallback
): Promise<RankedPost[]> {
  // Trim before stripping the "@": otherwise input such as " @name" keeps its
  // "@" and the profile URL ends up as "/@%40name".
  const clean = username.trim().replace(/^@/, "").trim();
  const collected: ThreadsPost[] = [];
  attachCollector(page, collected);

  const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`;
  await onStep?.("open_page", `@${clean}`);
  const response = await page.goto(profileUrl, { waitUntil: "domcontentloaded", timeout: 45000 });
  await onStep?.("landing_pause");
  await humanLandingPause(page);

  const bodyText = await page.locator("body").innerText();
  await assertThreadsPageSafe(page, response?.status(), bodyText);
  // These phrases are treated as the "profile not found" page: report zero posts.
  if (bodyText.includes("走丟") || bodyText.includes("找不到")) {
    await onStep?.("done", "0 篇");
    return [];
  }

  await onStep?.("wait_content");
  // Missing post links are tolerated; the fallbacks below deal with an empty page.
  await page.waitForSelector('a[href*="/post/"]', { timeout: 10000 }).catch(() => undefined);
  await onStep?.("scroll");
  await humanScrollPage(page, { minPasses: 2, maxPasses: 4 });

  if (collected.length < 2) {
    // Network capture yielded too little: try the data embedded in page scripts.
    await onStep?.("parse_scripts");
    const { posts } = await extractFromPageScripts(page);
    collected.push(...posts);
  } else {
    await onStep?.("parse_network", `${collected.length} 篇`);
  }

  if (collected.length < 2) {
    // Last resort: read the rendered DOM.
    await onStep?.("parse_dom");
    collected.push(...(await scrapeProfileDom(page, clean)));
  }

  const ranked = rankAndDedupe(
    // Posts without an author are attributed to the requested profile.
    collected.map((p) => ({ ...p, authorName: p.authorName ?? clean })),
    limit
  );
  await onStep?.("done", `${ranked.length} 篇`);
  return ranked;
}
|
|
|
|
/**
 * Opens a new page in the context provided by `withSharedContext`, scrapes the
 * profile's recent posts with `getProfilePostsOnPage`, and closes the page.
 *
 * @param storageState - Forwarded unchanged to `withSharedContext`.
 * @param username - Profile handle forwarded to `getProfilePostsOnPage`.
 * @param limit - Forwarded to `getProfilePostsOnPage` (default 15).
 * @returns The posts returned by `getProfilePostsOnPage`.
 */
export async function getProfilePosts(
  storageState: string,
  username: string,
  limit = 15
): Promise<RankedPost[]> {
  return withSharedContext(storageState, async (context) => {
    const page = await context.newPage();
    try {
      return await getProfilePostsOnPage(page, username, limit);
    } finally {
      // The context may stay alive after this callback (withSharedContext's
      // implementation is not visible here), so release the page — and the
      // listeners attached to it — explicitly. A failing close must not
      // replace the scrape result or hide the scrape error.
      await page.close().catch(() => undefined);
    }
  });
}
|