haixunMaster/lib/threads-browser/profile.ts

import { rankAndDedupe, type RankedPost } from "@/lib/ranking";
import { withSharedContext } from "./browser";
import { humanLandingPause, humanScrollPage } from "./human-behavior";
import type { BrowserProgressCallback } from "./progress";
import { extractFromPageScripts, extractPostsFromJson } from "./extract";
import type { ThreadsPost } from "./types";
import { assertThreadsPageSafe } from "./safety";

/**
 * Subscribes to the page's "response" events and forwards matching JSON
 * payloads, together with `collected`, to `extractPostsFromJson`
 * (presumably that helper appends any posts it finds to `collected` — confirm
 * against ./extract).
 *
 * A response is considered only when its URL contains "graphql", "threads" or
 * "instagram" and its content-type header mentions "json". Errors raised while
 * reading or parsing a body are swallowed on purpose: collection is best-effort.
 *
 * @param page - Playwright page whose network responses are observed.
 * @param collected - Array handed to `extractPostsFromJson` for every matching payload.
 * @returns A function that unregisters the listener. Callers that reuse one
 *   page for several scrapes can call it to stop collecting; ignoring the
 *   return value leaves the listener attached, as before.
 */
function attachCollector(page: import("playwright").Page, collected: ThreadsPost[]): () => void {
  // Keep a reference to the handler so it can be removed later with page.off().
  const onResponse = async (response: import("playwright").Response): Promise<void> => {
    const url = response.url();
    if (!url.includes("graphql") && !url.includes("threads") && !url.includes("instagram")) return;
    try {
      const contentType = response.headers()["content-type"] ?? "";
      if (!contentType.includes("json")) return;
      const json = await response.json();
      extractPostsFromJson(json, collected);
    } catch {
      // Best-effort: the body may be unavailable or not valid JSON.
    }
  };

  page.on("response", onResponse);
  return () => {
    page.off("response", onResponse);
  };
}

/**
 * Fallback scraper that reads posts directly from the rendered profile DOM.
 *
 * Looks at no more than the first 35 anchors matching `a[href*="/post/"]`.
 * For each distinct href it resolves an absolute permalink, derives the author
 * from the `@name/post` segment of the href (falling back to `username`), and
 * takes the text of the first `dir="auto"` div/span inside the closest
 * `data-pressable-container` ancestor, or inside a nearby ancestor div when no
 * such container exists.
 *
 * Entries whose trimmed text is shorter than 3 characters are dropped. Any
 * error while inspecting a single link skips that link only.
 *
 * @param page - Playwright page that already shows the profile.
 * @param username - Author name used when the href carries no `@name` segment.
 * @returns The posts that could be read from the DOM, in link order.
 */
async function scrapeProfileDom(page: import("playwright").Page, username: string): Promise<ThreadsPost[]> {
  const posts: ThreadsPost[] = [];
  const seen = new Set<string>();
  const links = page.locator('a[href*="/post/"]');
  const count = await links.count();

  for (let i = 0; i < Math.min(count, 35); i++) {
    const link = links.nth(i);
    try {
      const href = await link.getAttribute("href", { timeout: 1200 });
      if (!href || seen.has(href)) continue;
      seen.add(href);
      // Relative hrefs are resolved against the Threads origin.
      const permalink = href.startsWith("http") ? href : `https://www.threads.com${href}`;
      const authorName = href.match(/@([^/]+)\/post/)?.[1] ?? username;
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]");
      const scope = (await container.count()) > 0 ? container : link.locator("xpath=ancestor::div[position()<=6]").first();
      const rawText = await scope.locator('div[dir="auto"], span[dir="auto"]').first().innerText({ timeout: 1200 }).catch(() => "");
      // Trim before the length check so whitespace-padded fragments (e.g. " a ")
      // are not stored as near-empty posts.
      const text = rawText.trim();
      if (text.length < 3) continue;
      posts.push({
        text,
        permalink,
        authorName,
        externalId: href.match(/\/post\/([^/?]+)/)?.[1],
      });
    } catch {
      // Skip this link; the remaining links are still processed.
    }
  }
  return posts;
}

/**
 * Fetches an account's recent posts using an already-open page.
 *
 * Flow: attach the network collector, open the profile URL, pause, run the
 * safety check, then wait for post links and scroll. Posts come from captured
 * network JSON when at least two were collected; otherwise the page scripts
 * are parsed, and if fewer than two posts are known after that, the DOM is
 * scraped as a last resort. The combined list is passed to `rankAndDedupe`.
 *
 * Progress is reported through `onStep` with the step names "open_page",
 * "landing_pause", "wait_content", "scroll", "parse_scripts" or
 * "parse_network", optionally "parse_dom", and finally "done".
 *
 * @param page - Playwright page to navigate; it is left open on return.
 * @param username - Account handle; surrounding whitespace and one leading "@" are ignored.
 * @param limit - Value forwarded to `rankAndDedupe` (presumably the maximum
 *   number of posts returned — confirm against "@/lib/ranking").
 * @param onStep - Optional progress callback, awaited at every step.
 * @returns The ranked posts, or an empty array when the page body contains
 *   "走丟" or "找不到" (presumably the profile-not-found copy — confirm).
 */
export async function getProfilePostsOnPage(
  page: import("playwright").Page,
  username: string,
  limit = 15,
  onStep?: BrowserProgressCallback
): Promise<RankedPost[]> {
  // Trim first so input such as " @name" loses its "@"; stripping before
  // trimming left the "@" in place and produced a "/@%40name" URL.
  const clean = username.trim().replace(/^@/, "").trim();
  const collected: ThreadsPost[] = [];
  // NOTE(review): the listener registered by attachCollector is not removed
  // when this function returns, so repeated calls on one page stack listeners.
  attachCollector(page, collected);

  const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`;
  await onStep?.("open_page", `@${clean}`);
  const response = await page.goto(profileUrl, { waitUntil: "domcontentloaded", timeout: 45000 });
  await onStep?.("landing_pause");
  await humanLandingPause(page);

  const bodyText = await page.locator("body").innerText();
  // Presumably throws when the page looks blocked or challenged — confirm in ./safety.
  await assertThreadsPageSafe(page, response?.status(), bodyText);
  if (bodyText.includes("走丟") || bodyText.includes("找不到")) {
    await onStep?.("done", "0 篇");
    return [];
  }

  await onStep?.("wait_content");
  // A timeout here is not fatal: the fallbacks below still run.
  await page.waitForSelector('a[href*="/post/"]', { timeout: 10000 }).catch(() => undefined);
  await onStep?.("scroll");
  await humanScrollPage(page, { minPasses: 2, maxPasses: 4 });

  if (collected.length < 2) {
    // Network capture found too little; try data embedded in page scripts.
    await onStep?.("parse_scripts");
    const { posts } = await extractFromPageScripts(page);
    collected.push(...posts);
  } else {
    await onStep?.("parse_network", `${collected.length} 篇`);
  }
  if (collected.length < 2) {
    // Still too little: read whatever the rendered DOM exposes.
    await onStep?.("parse_dom");
    collected.push(...(await scrapeProfileDom(page, clean)));
  }

  // Posts without an author are attributed to the requested profile.
  const ranked = rankAndDedupe(
    collected.map((p) => ({ ...p, authorName: p.authorName ?? clean })),
    limit
  );
  await onStep?.("done", `${ranked.length} 篇`);
  return ranked;
}

/**
 * Convenience wrapper: opens a fresh page inside the context supplied by
 * `withSharedContext`, scrapes the profile with `getProfilePostsOnPage`, and
 * closes the page again.
 *
 * @param storageState - Forwarded unchanged to `withSharedContext`
 *   (presumably a Playwright storage state for a logged-in session — confirm in ./browser).
 * @param username - Account handle, with or without a leading "@".
 * @param limit - Forwarded to `getProfilePostsOnPage`.
 * @returns The ranked posts produced by `getProfilePostsOnPage`.
 */
export async function getProfilePosts(
  storageState: string,
  username: string,
  limit = 15
): Promise<RankedPost[]> {
  return withSharedContext(storageState, async (context) => {
    const page = await context.newPage();
    try {
      return await getProfilePostsOnPage(page, username, limit);
    } finally {
      // Always release the page so a long-lived shared context does not
      // accumulate open tabs; a failing close must not mask the real outcome.
      await page.close().catch(() => undefined);
    }
  });
}
feat init 2026-06-21 12:50:31 +00:00			`import { rankAndDedupe, type RankedPost } from "@/lib/ranking";`
			`import { withSharedContext } from "./browser";`
			`import { humanLandingPause, humanScrollPage } from "./human-behavior";`
			`import type { BrowserProgressCallback } from "./progress";`
			`import { extractFromPageScripts, extractPostsFromJson } from "./extract";`
			`import type { ThreadsPost } from "./types";`
			`import { assertThreadsPageSafe } from "./safety";`

			`function attachCollector(page: import("playwright").Page, collected: ThreadsPost[]) {`
			`page.on("response", async (response) => {`
			`const url = response.url();`
			`if (!url.includes("graphql") && !url.includes("threads") && !url.includes("instagram")) return;`
			`try {`
			`const contentType = response.headers()["content-type"] ?? "";`
			`if (!contentType.includes("json")) return;`
			`const json = await response.json();`
			`extractPostsFromJson(json, collected);`
			`} catch {`
			`// ignore`
			`}`
			`});`
			`}`

			`async function scrapeProfileDom(page: import("playwright").Page, username: string): Promise<ThreadsPost[]> {`
			`const posts: ThreadsPost[] = [];`
			`const seen = new Set<string>();`
			`const links = page.locator('a[href*="/post/"]');`
			`const count = await links.count();`

			`for (let i = 0; i < Math.min(count, 35); i++) {`
			`const link = links.nth(i);`
			`try {`
			`const href = await link.getAttribute("href", { timeout: 1200 });`
			`if (!href \|\| seen.has(href)) continue;`
			`seen.add(href);`
			const permalink = href.startsWith("http") ? href : `https://www.threads.com${href}`;
			`const authorName = href.match(/@([^/]+)\/post/)?.[1] ?? username;`
			`const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]");`
			`const scope = (await container.count()) > 0 ? container : link.locator("xpath=ancestor::div[position()<=6]").first();`
			`const text = await scope.locator('div[dir="auto"], span[dir="auto"]').first().innerText({ timeout: 1200 }).catch(() => "");`
			`if (!text \|\| text.length < 3) continue;`
			`posts.push({`
			`text: text.trim(),`
			`permalink,`
			`authorName,`
			`externalId: href.match(/\/post\/([^/?]+)/)?.[1],`
			`});`
			`} catch {`
			`// skip`
			`}`
			`}`
			`return posts;`
			`}`

			`/** 在既有 page 上抓取帳號近期貼文 */`
			`export async function getProfilePostsOnPage(`
			`page: import("playwright").Page,`
			`username: string,`
			`limit = 15,`
			`onStep?: BrowserProgressCallback`
			`): Promise<RankedPost[]> {`
			`const clean = username.replace(/^@/, "").trim();`
			`const collected: ThreadsPost[] = [];`
			`attachCollector(page, collected);`

			const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`;
			await onStep?.("open_page", `@${clean}`);
			`const response = await page.goto(profileUrl, { waitUntil: "domcontentloaded", timeout: 45000 });`
			`await onStep?.("landing_pause");`
			`await humanLandingPause(page);`

			`const bodyText = await page.locator("body").innerText();`
			`await assertThreadsPageSafe(page, response?.status(), bodyText);`
			`if (bodyText.includes("走丟") \|\| bodyText.includes("找不到")) {`
			`await onStep?.("done", "0 篇");`
			`return [];`
			`}`

			`await onStep?.("wait_content");`
			`await page.waitForSelector('a[href*="/post/"]', { timeout: 10000 }).catch(() => undefined);`
			`await onStep?.("scroll");`
			`await humanScrollPage(page, { minPasses: 2, maxPasses: 4 });`

			`if (collected.length < 2) {`
			`await onStep?.("parse_scripts");`
			`const { posts } = await extractFromPageScripts(page);`
			`collected.push(...posts);`
			`} else {`
			await onStep?.("parse_network", `${collected.length} 篇`);
			`}`
			`if (collected.length < 2) {`
			`await onStep?.("parse_dom");`
			`collected.push(...(await scrapeProfileDom(page, clean)));`
			`}`

			`const ranked = rankAndDedupe(`
			`collected.map((p) => ({ ...p, authorName: p.authorName ?? clean })),`
			`limit`
			`);`
			await onStep?.("done", `${ranked.length} 篇`);
			`return ranked;`
			`}`

			`export async function getProfilePosts(`
			`storageState: string,`
			`username: string,`
			`limit = 15`
			`): Promise<RankedPost[]> {`
			`return withSharedContext(storageState, async (context) => {`
			`const page = await context.newPage();`
			`return getProfilePostsOnPage(page, username, limit);`
			`});`
			`}`