// thread-master/worker/threads-profile-scraper.ts

import { chromium, type BrowserContext, type BrowserContextOptions, type Page } from 'playwright'

/** Public shape of a single scraped Threads post, as returned by `getProfilePosts`. */
export type ScrapedPost = {
  /** Body text of the post. */
  text: string
  /** Absolute URL of the post, when one could be built or read from the page. */
  permalink?: string
  /** Like count, when the source payload carried one. */
  likeCount?: number
  /** Reply count, when the source payload carried one. */
  replyCount?: number
}

/** User-Agent string applied to every browser context (identifies as desktop Chrome 131 on macOS). */
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'

/**
 * Command-line flags passed to Chromium at launch.
 * The first disables the Blink `AutomationControlled` feature (presumably to reduce
 * automation fingerprinting); the other two suppress first-run and default-browser checks.
 */
const BROWSER_ARGS = [
  '--disable-blink-features=AutomationControlled',
  '--no-first-run',
  '--no-default-browser-check',
]

/**
 * Init script registered on the browser context via `addInitScript`.
 * Redefines `navigator.webdriver` so that reading it yields `false`.
 *
 * NOTE(review): Playwright evaluates init scripts inside the page rather than in this
 * Node process, so this function should stay self-contained — confirm before referencing
 * any module-level value from it.
 */
const STEALTH_INIT_SCRIPT = () => {
  Object.defineProperty(navigator, 'webdriver', { get: () => false })
}

/**
 * Internal working shape of a post while scraping: the public `ScrapedPost` fields plus
 * identifiers used for de-duplication and attribution.
 */
type ThreadsPost = ScrapedPost & {
  /** Post identifier taken from the payload (`code`/`pk`/`id`) or from the post URL. */
  externalId?: string
  /** Username of the post's author, when known. */
  authorName?: string
}

/**
 * Depth-first, pre-order traversal over an arbitrary JSON-like value.
 *
 * `visit` is called for every non-array object encountered, parent before children.
 * Arrays are descended into but never handed to `visit`; primitives and `null` are skipped.
 *
 * @param data - Value to traverse; anything that is not an object is a no-op.
 * @param visit - Callback receiving each non-array object.
 */
function walkJson(data: unknown, visit: (obj: Record<string, unknown>) => void): void {
  if (data === null || typeof data !== 'object') return

  let children: unknown[]
  if (Array.isArray(data)) {
    children = data
  } else {
    const record = data as Record<string, unknown>
    visit(record)
    // Property values are read only after the visitor has run on the parent.
    children = Object.values(record)
  }

  // Non-object children are rejected by the guard at the top of the recursive call.
  for (const child of children) walkJson(child, visit)
}

/**
 * Picks the post text out of a raw payload object.
 *
 * Candidates are tried in priority order: `caption.text`, `text_post_app_info.text`, then a
 * top-level `text`. A candidate is accepted only when it really is a string, so a payload
 * carrying a number or object in one of those slots falls through to the next candidate
 * instead of leaking a non-string value to callers.
 *
 * @param obj - Raw object taken from a parsed JSON payload.
 * @returns The first string candidate (which may be empty), or `undefined` when none exists.
 */
function getPostText(obj: Record<string, unknown>): string | undefined {
  // Reads `.text` from a nested wrapper, tolerating a missing or non-object wrapper.
  const nestedText = (wrapper: unknown): unknown =>
    wrapper !== null && typeof wrapper === 'object'
      ? (wrapper as { text?: unknown }).text
      : undefined

  const candidates: unknown[] = [
    nestedText(obj.caption),
    nestedText(obj.text_post_app_info),
    obj.text,
  ]
  return candidates.find((candidate): candidate is string => typeof candidate === 'string')
}

/**
 * Walks a parsed JSON payload and appends every object that looks like a post to `posts`.
 *
 * An object counts as a post when `getPostText` yields at least 5 characters of text.
 * The identifier comes from `code`, `pk` or `id` (first usable one); a permalink is built
 * only when both an identifier and a username are present.
 *
 * The nested `caption` / `text_post_app_info` object that supplied a post's text is
 * remembered and skipped when the walk reaches it. Otherwise the same text would be
 * emitted a second time as a separate, link-less entry with a different identifier, which
 * later de-duplication cannot merge.
 *
 * Counts and identifiers are validated rather than cast, so `null`, strings or objects in
 * those slots become `undefined` instead of leaking into number/string fields.
 *
 * @param data - Parsed JSON of unknown shape.
 * @param posts - Output array; matching posts are pushed onto it in traversal order.
 */
function extractPostsFromJson(data: unknown, posts: ThreadsPost[]): void {
  // Wrapper objects whose text has already been emitted through their parent.
  const alreadyEmitted = new Set<unknown>()

  const asRecord = (value: unknown): Record<string, unknown> | undefined =>
    value !== null && typeof value === 'object' ? (value as Record<string, unknown>) : undefined
  const asCount = (value: unknown): number | undefined =>
    typeof value === 'number' && Number.isFinite(value) ? value : undefined
  const asId = (value: unknown): string | undefined => {
    if (typeof value === 'string' && value.length > 0) return value
    if (typeof value === 'number' && Number.isFinite(value)) return String(value)
    return undefined
  }
  const asUsername = (value: unknown): string | undefined => {
    const name = asRecord(value)?.username
    return typeof name === 'string' && name.length > 0 ? name : undefined
  }

  walkJson(data, (obj) => {
    if (alreadyEmitted.has(obj)) return

    const text = getPostText(obj)
    if (!text || text.length < 5) return

    const caption = asRecord(obj.caption)
    const appInfo = asRecord(obj.text_post_app_info)
    for (const wrapper of [caption, appInfo]) {
      // The parent is visited before its children, so marking here is early enough.
      if (wrapper && wrapper.text === text) alreadyEmitted.add(wrapper)
    }

    const code = asId(obj.code) ?? asId(obj.pk) ?? asId(obj.id)
    const username = asUsername(obj.user) ?? asUsername(obj.owner)

    posts.push({
      externalId: code,
      text,
      permalink:
        code && username ? `https://www.threads.com/@${username}/post/${code}` : undefined,
      authorName: username,
      likeCount: asCount(obj.like_count),
      replyCount: asCount(appInfo?.direct_reply_count) ?? asCount(obj.reply_count),
    })
  })
}

/**
 * Second-stage extraction: reads the JSON blobs embedded in the page's
 * `<script type="application/json" data-sjs>` tags and pulls posts out of them.
 *
 * Only scripts whose text mentions `thread_items` are parsed; a script that fails to read
 * or parse is skipped without aborting the rest.
 *
 * @param page - Page that has already navigated to the profile.
 * @returns Posts found across all matching scripts, in document order.
 */
async function extractFromPageScripts(page: Page): Promise<ThreadsPost[]> {
  const found: ThreadsPost[] = []
  const scriptTags = await page.locator('script[type="application/json"][data-sjs]').all()

  for (const tag of scriptTags) {
    try {
      const payload = await tag.textContent()
      if (payload?.includes('thread_items')) {
        extractPostsFromJson(JSON.parse(payload), found)
      }
    } catch {
      // Malformed or unreadable script: move on to the next one.
    }
  }

  return found
}

/**
 * First-stage extraction: listens to the page's network responses and harvests posts from
 * JSON bodies as they arrive.
 *
 * A response is inspected only when its URL contains `graphql`, `threads` or `instagram`
 * and its `content-type` header mentions `json`. Any failure while reading or parsing a
 * response is ignored.
 *
 * @param page - Page to observe; attach before navigating.
 * @param collected - Shared array that discovered posts are appended to.
 */
function attachCollector(page: Page, collected: ThreadsPost[]) {
  const watchedUrlParts = ['graphql', 'threads', 'instagram']

  page.on('response', async (response) => {
    const url = response.url()
    const isWatched = watchedUrlParts.some((part) => url.includes(part))
    if (!isWatched) return

    try {
      const contentType = response.headers()['content-type'] ?? ''
      if (contentType.includes('json')) {
        const body = await response.json()
        extractPostsFromJson(body, collected)
      }
    } catch {
      // Best effort only: unreadable or non-JSON bodies are dropped.
    }
  })
}

/**
 * Third-stage extraction: reads posts straight from the rendered DOM.
 *
 * Looks at up to 35 `a[href*="/post/"]` links. For each distinct href it resolves the
 * enclosing post container and takes the first `dir="auto"` text node inside it as the
 * post text. The author and post id are parsed from the href; `username` is the fallback
 * author when the href has no `@name` segment.
 *
 * Text is trimmed before the minimum-length check, so whitespace-only or
 * whitespace-padded snippets are never emitted as empty or too-short posts.
 *
 * @param page - Page showing the profile.
 * @param username - Profile username without the leading `@`.
 * @returns Posts found in the DOM; links that fail to resolve are skipped.
 */
async function scrapeProfileDom(page: Page, username: string): Promise<ThreadsPost[]> {
  const maxLinks = 35
  const stepTimeoutMs = 1200
  const minTextLength = 3
  // Hoisted so the patterns are not rebuilt for every link.
  const authorPattern = /@([^/]+)\/post/
  const postIdPattern = /\/post\/([^/?]+)/

  const posts: ThreadsPost[] = []
  const seen = new Set<string>()
  const links = page.locator('a[href*="/post/"]')
  const total = Math.min(await links.count(), maxLinks)

  for (let i = 0; i < total; i++) {
    const link = links.nth(i)
    try {
      const href = await link.getAttribute('href', { timeout: stepTimeoutMs })
      if (!href || seen.has(href)) continue
      seen.add(href)

      const permalink = href.startsWith('http') ? href : `https://www.threads.com${href}`
      const authorName = href.match(authorPattern)?.[1] ?? username

      // Prefer the nearest pressable container; otherwise fall back to a close ancestor div.
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]")
      const scope =
        (await container.count()) > 0 ? container : link.locator('xpath=ancestor::div[position()<=6]').first()

      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: stepTimeoutMs })
        .catch(() => '')
      const text = rawText.trim()
      if (text.length < minTextLength) continue

      posts.push({
        text,
        permalink,
        authorName,
        externalId: href.match(postIdPattern)?.[1],
      })
    } catch {
      // A single unreadable link must not abort the whole scrape.
    }
  }
  return posts
}

/**
 * Waits a randomised 1200–2999 ms after landing on a page.
 *
 * @param page - Page whose timer is used for the wait.
 */
async function humanLandingPause(page: Page) {
  const jitterMs = Math.floor(Math.random() * 1800)
  const delayMs = 1200 + jitterMs
  await page.waitForTimeout(delayMs)
}

/**
 * Scrolls the page down in 2–4 mouse-wheel passes.
 *
 * Each pass scrolls by a random 500–1199 px and is followed by a random 800–1999 ms wait.
 *
 * @param page - Page to scroll.
 */
async function humanScrollPage(page: Page) {
  const randomFrom = (base: number, spread: number): number =>
    base + Math.floor(Math.random() * spread)

  let remainingPasses = randomFrom(2, 3)
  while (remainingPasses > 0) {
    await page.mouse.wheel(0, randomFrom(500, 700))
    await page.waitForTimeout(randomFrom(800, 1200))
    remainingPasses -= 1
  }
}

/**
 * Removes duplicate posts, strips internal-only fields and caps the result at `limit`.
 *
 * Two posts are duplicates when they share a key, where the key is the first available of
 * `externalId`, `permalink`, or the first 120 characters of the text. The first occurrence
 * wins and input order is preserved.
 *
 * The cap is checked before each insertion, so a `limit` of 0 or less yields an empty
 * array rather than a single post.
 *
 * @param posts - Candidate posts in priority order.
 * @param limit - Maximum number of posts to return.
 * @returns At most `limit` unique posts carrying only the public `ScrapedPost` fields.
 */
function dedupePosts(posts: ThreadsPost[], limit: number): ScrapedPost[] {
  const seen = new Set<string>()
  const out: ScrapedPost[] = []
  for (const post of posts) {
    if (out.length >= limit) break

    const key = post.externalId ?? post.permalink ?? post.text.slice(0, 120)
    if (seen.has(key)) continue
    seen.add(key)

    // Copy only the public fields; externalId/authorName are internal bookkeeping.
    out.push({
      text: post.text,
      permalink: post.permalink,
      likeCount: post.likeCount,
      replyCount: post.replyCount,
    })
  }
  return out
}

/**
 * Launches Chromium and opens a browser context restored from a serialized session.
 *
 * The session JSON is validated before the browser is launched, so corrupted data fails
 * fast without starting a process. If context creation or init-script registration fails,
 * the browser is closed before the error propagates.
 *
 * @param storageState - JSON-serialized Playwright storage state (cookies / origins).
 * @returns The context plus a `close` function that shuts down both the context and the
 *   browser; the browser is closed even when closing the context throws.
 * @throws Error when `storageState` is not valid JSON or does not decode to an object.
 */
async function createBrowserContext(storageState: string): Promise<{ context: BrowserContext; close: () => Promise<void> }> {
  const corruptedMessage = '瀏覽器 session 資料損毀，請到連線設定重新同步 Chrome extension'

  let parsedState: unknown
  try {
    parsedState = JSON.parse(storageState)
  } catch {
    throw new Error(corruptedMessage)
  }
  // A bare JSON string would be interpreted by Playwright as a file path, and other
  // primitives are never a valid session, so only objects are accepted.
  if (parsedState === null || typeof parsedState !== 'object') {
    throw new Error(corruptedMessage)
  }

  const browser = await chromium.launch({
    headless: process.env.PLAYWRIGHT_HEADLESS !== 'false',
    args: BROWSER_ARGS,
  })

  try {
    const context = await browser.newContext({
      storageState: parsedState as BrowserContextOptions['storageState'],
      userAgent: USER_AGENT,
      viewport: { width: 1280, height: 900 },
      locale: 'zh-TW',
      timezoneId: 'Asia/Taipei',
    })
    await context.addInitScript(STEALTH_INIT_SCRIPT)
    return {
      context,
      close: async () => {
        try {
          await context.close()
        } finally {
          await browser.close()
        }
      },
    }
  } catch (error) {
    // Best-effort cleanup; the original failure is the one worth reporting.
    await browser.close().catch(() => undefined)
    throw error
  }
}

/**
 * Scrapes recent posts from a Threads profile using a stored login session.
 *
 * Uses the three-stage strategy carried over from the legacy
 * lib/threads-browser/profile.ts: network JSON (GraphQL) → embedded JSON → DOM.
 * Each later stage runs only when fewer than two posts have been gathered so far.
 *
 * @param storageState - JSON-serialized browser session; must not be blank.
 * @param username - Profile username, with or without a leading `@`; surrounding
 *   whitespace is ignored.
 * @param limit - Maximum number of posts to return (default 15).
 * @returns Up to `limit` unique posts; an empty array when the username is empty, the
 *   limit is not positive, or the page reports the profile as missing.
 * @throws Error when `storageState` is blank or cannot be parsed.
 */
export async function getProfilePosts(
  storageState: string,
  username: string,
  limit = 15,
): Promise<ScrapedPost[]> {
  // Trim first: otherwise a value such as " @name" keeps its "@" and ends up in the URL.
  const clean = username.trim().replace(/^@/, '').trim()
  if (!storageState.trim()) {
    throw new Error(
      '找不到 Chrome session。請先在巡樓選定經營帳號，到連線頁用擴充套件同步 Threads 登入態後再跑 8D。',
    )
  }
  // Nothing to fetch: avoid launching a browser for an empty profile name or zero posts.
  if (!clean || limit <= 0) return []

  const { context, close } = await createBrowserContext(storageState)
  try {
    const page = await context.newPage()
    const collected: ThreadsPost[] = []
    // Must be attached before navigation so the initial responses are captured.
    attachCollector(page, collected)

    const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`
    await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)

    // NOTE(review): this is a substring match over the whole page text, so a profile whose
    // bio or posts happen to contain either phrase is also treated as missing — confirm the
    // exact not-found wording and tighten if needed.
    const bodyText = await page.locator('body').innerText()
    if (bodyText.includes('走丟') || bodyText.includes('找不到')) {
      return []
    }

    // Posts may never render (empty or private profile); a timeout here is not an error.
    await page.waitForSelector('a[href*="/post/"]', { timeout: 10_000 }).catch(() => undefined)
    await humanScrollPage(page)

    if (collected.length < 2) {
      collected.push(...(await extractFromPageScripts(page)))
    }
    if (collected.length < 2) {
      collected.push(...(await scrapeProfileDom(page, clean)))
    }

    return dedupePosts(collected, limit)
  } finally {
    await close()
  }
}