haixunMaster/lib/threads-browser/profile.ts

114 lines
4.1 KiB
TypeScript
Raw Permalink Normal View History

2026-06-21 12:50:31 +00:00
import { rankAndDedupe, type RankedPost } from "@/lib/ranking";
import { withSharedContext } from "./browser";
import { humanLandingPause, humanScrollPage } from "./human-behavior";
import type { BrowserProgressCallback } from "./progress";
import { extractFromPageScripts, extractPostsFromJson } from "./extract";
import type { ThreadsPost } from "./types";
import { assertThreadsPageSafe } from "./safety";
/**
 * Subscribes to the page's "response" events and hands every matching JSON
 * payload to `extractPostsFromJson`, together with the `collected` array.
 *
 * A response is considered only when its URL contains "graphql", "threads" or
 * "instagram" and its content-type header mentions "json".
 *
 * @param page - Playwright page whose network responses are observed.
 * @param collected - Array passed to `extractPostsFromJson` as its output
 *   argument (presumably appended to in place — confirm in ./extract).
 */
function attachCollector(page: import("playwright").Page, collected: ThreadsPost[]) {
  const urlMarkers = ["graphql", "threads", "instagram"];
  page.on("response", async (res) => {
    const requestUrl = res.url();
    const isCandidate = urlMarkers.some((marker) => requestUrl.includes(marker));
    if (!isCandidate) return;
    try {
      const mime = res.headers()["content-type"] ?? "";
      if (mime.includes("json")) {
        const payload = await res.json();
        extractPostsFromJson(payload, collected);
      }
    } catch {
      // Best effort: a response that cannot be read or parsed is skipped.
    }
  });
}
/**
 * DOM fallback: builds posts from the `/post/` anchors currently rendered on
 * the page.
 *
 * For each of the first 35 matching links (deduplicated by raw `href`), the
 * function resolves an absolute permalink, derives the author from the
 * `@name/post` segment of the href (falling back to `username`), and reads the
 * first `dir="auto"` div/span inside the nearest pressable container (or, when
 * none exists, inside one of the link's closest `div` ancestors).
 *
 * Text is trimmed BEFORE the minimum-length check so whitespace-only content
 * is never emitted as an empty post.
 *
 * @param page - Playwright page already showing the profile.
 * @param username - Author name used when the href carries no `@name` segment.
 * @returns Posts scraped from the DOM; links that fail to resolve are skipped.
 */
async function scrapeProfileDom(page: import("playwright").Page, username: string): Promise<ThreadsPost[]> {
  const maxLinks = 35;
  const stepTimeoutMs = 1200;
  const minTextLength = 3;

  const posts: ThreadsPost[] = [];
  const seen = new Set<string>();
  const links = page.locator('a[href*="/post/"]');
  const count = await links.count();

  for (let i = 0; i < Math.min(count, maxLinks); i++) {
    const link = links.nth(i);
    try {
      const href = await link.getAttribute("href", { timeout: stepTimeoutMs });
      if (!href || seen.has(href)) continue;
      seen.add(href);

      const permalink = href.startsWith("http") ? href : `https://www.threads.com${href}`;
      const authorName = href.match(/@([^/]+)\/post/)?.[1] ?? username;

      // Prefer the nearest pressable container; otherwise fall back to a nearby div ancestor.
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]");
      const scope = (await container.count()) > 0 ? container : link.locator("xpath=ancestor::div[position()<=6]").first();

      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: stepTimeoutMs })
        .catch(() => "");

      // Trim first: the length threshold applies to the text that is actually stored.
      const text = rawText.trim();
      if (text.length < minTextLength) continue;

      posts.push({
        text,
        permalink,
        authorName,
        externalId: href.match(/\/post\/([^/?]+)/)?.[1],
      });
    } catch {
      // Best effort: a link that cannot be read is skipped, the rest still count.
    }
  }
  return posts;
}
/**
 * Fetches an account's recent posts using an already-open page.
 *
 * Flow, as implemented below:
 * 1. Normalise `username` (surrounding whitespace and one leading "@" removed).
 * 2. Attach the network collector, navigate to the profile URL, pause.
 * 3. Run `assertThreadsPageSafe` with the HTTP status and body text
 *    (presumably throws on blocked/unsafe pages — confirm in ./safety).
 * 4. Return `[]` when the body text contains one of the two marker strings
 *    checked below (presumably the profile-not-found page).
 * 5. Wait for post links, scroll, then fall back to page-script extraction and
 *    finally DOM scraping while fewer than 2 posts have been collected.
 * 6. Rank and de-duplicate, defaulting a missing `authorName` to the username.
 *
 * @param page - Playwright page to drive; it is navigated away from its current URL.
 * @param username - Account handle, with or without a leading "@".
 * @param limit - Passed to `rankAndDedupe` as its second argument (default 15).
 * @param onStep - Optional progress callback invoked before each stage.
 * @returns The ranked posts produced by `rankAndDedupe`.
 */
export async function getProfilePostsOnPage(
  page: import("playwright").Page,
  username: string,
  limit = 15,
  onStep?: BrowserProgressCallback
): Promise<RankedPost[]> {
  // Trim before stripping "@": otherwise " @name" keeps its "@" and the URL
  // becomes "/@%40name". The second trim keeps "@ name" → "name" as before.
  const clean = username.trim().replace(/^@/, "").trim();
  const collected: ThreadsPost[] = [];
  attachCollector(page, collected);

  const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`;
  await onStep?.("open_page", `@${clean}`);
  const response = await page.goto(profileUrl, { waitUntil: "domcontentloaded", timeout: 45000 });

  await onStep?.("landing_pause");
  await humanLandingPause(page);

  const bodyText = await page.locator("body").innerText();
  await assertThreadsPageSafe(page, response?.status(), bodyText);
  if (bodyText.includes("走丟") || bodyText.includes("找不到")) {
    await onStep?.("done", "0 篇");
    return [];
  }

  await onStep?.("wait_content");
  // A timeout here is not fatal: the fallbacks below still get a chance.
  await page.waitForSelector('a[href*="/post/"]', { timeout: 10000 }).catch(() => undefined);

  await onStep?.("scroll");
  await humanScrollPage(page, { minPasses: 2, maxPasses: 4 });

  if (collected.length < 2) {
    // Network capture yielded too little: try data embedded in page scripts.
    await onStep?.("parse_scripts");
    const { posts } = await extractFromPageScripts(page);
    collected.push(...posts);
  } else {
    await onStep?.("parse_network", `${collected.length}`);
  }

  if (collected.length < 2) {
    // Last resort: scrape the rendered DOM.
    await onStep?.("parse_dom");
    collected.push(...(await scrapeProfileDom(page, clean)));
  }

  const ranked = rankAndDedupe(
    collected.map((p) => ({ ...p, authorName: p.authorName ?? clean })),
    limit
  );
  await onStep?.("done", `${ranked.length}`);
  return ranked;
}
/**
 * Opens a fresh page inside the context supplied by `withSharedContext` and
 * fetches the account's recent posts through `getProfilePostsOnPage`.
 *
 * The page is closed once scraping finishes, whether it succeeded or threw, so
 * repeated calls do not accumulate open tabs in the context.
 *
 * @param storageState - Forwarded to `withSharedContext` unchanged.
 * @param username - Account handle, with or without a leading "@".
 * @param limit - Forwarded to `getProfilePostsOnPage` (default 15).
 * @returns The ranked posts returned by `getProfilePostsOnPage`.
 */
export async function getProfilePosts(
  storageState: string,
  username: string,
  limit = 15
): Promise<RankedPost[]> {
  return withSharedContext(storageState, async (context) => {
    const page = await context.newPage();
    try {
      return await getProfilePostsOnPage(page, username, limit);
    } finally {
      // A failure to close must not mask the scrape result or its error.
      await page.close().catch(() => undefined);
    }
  });
}