import { rankAndDedupe, type RankedPost } from "@/lib/ranking";
import { withSharedContext } from "./browser";
import { humanLandingPause, humanScrollPage } from "./human-behavior";
import type { BrowserProgressCallback } from "./progress";
import { extractFromPageScripts, extractPostsFromJson } from "./extract";
import type { ThreadsPost } from "./types";
import { assertThreadsPageSafe } from "./safety";

/**
 * Subscribes to the page's network responses and passes every JSON payload
 * whose URL mentions "graphql", "threads" or "instagram" to
 * `extractPostsFromJson`, together with the `collected` array.
 *
 * @param page - Page whose responses are observed.
 * @param collected - Array handed to `extractPostsFromJson` for each payload.
 * @returns A function that unregisters the listener. Callers should invoke it
 *   once they are done, otherwise the listener stays on the page for the
 *   page's whole lifetime.
 */
function attachCollector(
  page: import("playwright").Page,
  collected: ThreadsPost[]
): () => void {
  const onResponse = async (response: import("playwright").Response): Promise<void> => {
    const url = response.url();
    if (!url.includes("graphql") && !url.includes("threads") && !url.includes("instagram")) return;

    try {
      const contentType = response.headers()["content-type"] ?? "";
      if (!contentType.includes("json")) return;

      const json = await response.json();
      extractPostsFromJson(json, collected);
    } catch {
      // Best effort: a response whose body cannot be read or parsed is skipped.
    }
  };

  page.on("response", onResponse);
  return () => {
    page.off("response", onResponse);
  };
}

/**
 * Fallback extraction that reads posts straight from the rendered DOM.
 *
 * Looks at up to 35 anchors whose href contains "/post/", de-duplicates them
 * by href, and for each one reads the first `dir="auto"` div/span found inside
 * the nearest ancestor marked `data-pressable-container="true"` (or, failing
 * that, inside a nearby ancestor `div`).
 *
 * @param page - Page that already shows the profile.
 * @param username - Author name used when the href does not contain an
 *   `@name/post` segment.
 * @returns The posts that could be read; links that fail or yield fewer than
 *   three characters of text are skipped.
 */
async function scrapeProfileDom(
  page: import("playwright").Page,
  username: string
): Promise<ThreadsPost[]> {
  const posts: ThreadsPost[] = [];
  const seen = new Set<string>();
  const links = page.locator('a[href*="/post/"]');
  const inspected = Math.min(await links.count(), 35);

  for (let i = 0; i < inspected; i++) {
    const link = links.nth(i);
    try {
      const href = await link.getAttribute("href", { timeout: 1200 });
      if (!href || seen.has(href)) continue;
      seen.add(href);

      const permalink = href.startsWith("http") ? href : `https://www.threads.com${href}`;
      const authorName = href.match(/@([^/]+)\/post/)?.[1] ?? username;

      const container = link.locator(
        "xpath=ancestor::*[contains(@data-pressable-container,'true')][1]"
      );
      const scope =
        (await container.count()) > 0
          ? container
          : link.locator("xpath=ancestor::div[position()<=6]").first();

      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: 1200 })
        .catch(() => "");

      // Trim before measuring so whitespace-padded fragments are not stored
      // as empty or one-character posts.
      const text = rawText.trim();
      if (text.length < 3) continue;

      posts.push({
        text,
        permalink,
        authorName,
        externalId: href.match(/\/post\/([^/?]+)/)?.[1],
      });
    } catch {
      // Skip links that cannot be read (e.g. attribute lookup timed out).
    }
  }

  return posts;
}

/**
 * Fetches an account's recent posts using an already-open page.
 *
 * Posts are gathered from up to three sources, each one tried only while
 * fewer than two posts have been collected so far:
 *   1. JSON network responses observed while the profile loads and scrolls,
 *   2. `extractFromPageScripts(page)`,
 *   3. the DOM fallback `scrapeProfileDom`.
 * The result is passed through `rankAndDedupe` with `limit`.
 *
 * @param page - Page to navigate; it is left open for the caller.
 * @param username - Account name, with or without a leading "@".
 * @param limit - Forwarded to `rankAndDedupe`; defaults to 15.
 * @param onStep - Optional progress callback, awaited at each stage.
 * @returns The ranked posts, or an empty array when the page body contains
 *   the "走丟" / "找不到" (lost / not found) wording.
 * @throws Whatever `page.goto` or `assertThreadsPageSafe` throws.
 */
export async function getProfilePostsOnPage(
  page: import("playwright").Page,
  username: string,
  limit = 15,
  onStep?: BrowserProgressCallback
): Promise<RankedPost[]> {
  // Trim first so a value such as " @name" still loses its "@".
  const clean = username.trim().replace(/^@/, "").trim();
  const collected: ThreadsPost[] = [];
  const detachCollector = attachCollector(page, collected);

  try {
    const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`;
    await onStep?.("open_page", `@${clean}`);
    const response = await page.goto(profileUrl, {
      waitUntil: "domcontentloaded",
      timeout: 45000,
    });

    await onStep?.("landing_pause");
    await humanLandingPause(page);

    const bodyText = await page.locator("body").innerText();
    // NOTE(review): presumably throws when the page looks blocked or unsafe;
    // confirm against ./safety.
    await assertThreadsPageSafe(page, response?.status(), bodyText);

    if (bodyText.includes("走丟") || bodyText.includes("找不到")) {
      await onStep?.("done", "0 篇");
      return [];
    }

    await onStep?.("wait_content");
    // A missing post link is not fatal: the fallbacks below still run.
    await page.waitForSelector('a[href*="/post/"]', { timeout: 10000 }).catch(() => undefined);

    await onStep?.("scroll");
    await humanScrollPage(page, { minPasses: 2, maxPasses: 4 });

    if (collected.length < 2) {
      await onStep?.("parse_scripts");
      const { posts } = await extractFromPageScripts(page);
      collected.push(...posts);
    } else {
      await onStep?.("parse_network", `${collected.length} 篇`);
    }

    if (collected.length < 2) {
      await onStep?.("parse_dom");
      collected.push(...(await scrapeProfileDom(page, clean)));
    }

    const ranked = rankAndDedupe(
      collected.map((p) => ({ ...p, authorName: p.authorName ?? clean })),
      limit
    );
    await onStep?.("done", `${ranked.length} 篇`);
    return ranked;
  } finally {
    // The page belongs to the caller and may be reused, so the response
    // listener must not outlive this call.
    detachCollector();
  }
}

/**
 * Opens a fresh page in the shared browser context for `storageState`,
 * fetches the account's recent posts on it, and closes the page again.
 *
 * @param storageState - Passed to `withSharedContext` to select/create the context.
 * @param username - Account name, with or without a leading "@".
 * @param limit - Forwarded to `getProfilePostsOnPage`; defaults to 15.
 * @returns The ranked posts returned by `getProfilePostsOnPage`.
 */
export async function getProfilePosts(
  storageState: string,
  username: string,
  limit = 15
): Promise<RankedPost[]> {
  return withSharedContext(storageState, async (context) => {
    const page = await context.newPage();
    try {
      return await getProfilePostsOnPage(page, username, limit);
    } finally {
      // Close the page so repeated calls do not pile up tabs in the shared
      // context; a failing close must not mask the scrape result or error.
      await page.close().catch(() => undefined);
    }
  });
}