import { chromium, type BrowserContext, type BrowserContextOptions, type Page } from 'playwright'

/** One post returned by a keyword search. Only `text` is guaranteed to be present. */
export type KeywordSearchPost = {
  text: string
  permalink?: string
  externalId?: string
  authorName?: string
  authorVerified?: boolean
  followerCount?: number
  likeCount?: number
  replyCount?: number
}

const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'

const BROWSER_ARGS = [
  '--disable-blink-features=AutomationControlled',
  '--no-first-run',
  '--no-default-browser-check',
]

/** Registered as a context init script; makes `navigator.webdriver` read as `false`. */
const STEALTH_INIT_SCRIPT = () => {
  Object.defineProperty(navigator, 'webdriver', { get: () => false })
}

/** Posts shorter than this (in characters) are discarded. */
const MIN_TEXT_LENGTH = 5
/** Upper bound on the number of post links inspected by the DOM fallback. */
const MAX_DOM_LINKS = 40
/** Per-element timeout for DOM reads in the DOM fallback. */
const DOM_READ_TIMEOUT_MS = 1200
/** Below this many collected posts the next fallback extraction strategy is tried. */
const MIN_COLLECTED_BEFORE_FALLBACK = 3

const AUTHOR_FROM_HREF = /@([^/]+)\/post/
const POST_ID_FROM_HREF = /\/post\/([^/?]+)/

type ThreadsPost = KeywordSearchPost
type JsonObject = Record<string, unknown>

/** Narrows an unknown value to a plain (non-array, non-null) object. */
function asObject(value: unknown): JsonObject | undefined {
  return value !== null && typeof value === 'object' && !Array.isArray(value)
    ? (value as JsonObject)
    : undefined
}

/** Returns the value when it is a finite, non-negative number; otherwise `undefined`. */
function readCount(value: unknown): number | undefined {
  return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined
}

/**
 * Picks the first usable identifier among `code`, `pk` and `id`.
 * Only non-empty strings and non-zero numbers are accepted, so an object or
 * boolean can never be stringified into a bogus id such as "[object Object]".
 */
function readId(obj: JsonObject): string | undefined {
  for (const key of ['code', 'pk', 'id']) {
    const value = obj[key]
    if ((typeof value === 'string' || typeof value === 'number') && value) return String(value)
  }
  return undefined
}

/** Reads `username` from the `user` object, falling back to the `owner` object. */
function readUsername(obj: JsonObject): string | undefined {
  const fromUser = asObject(obj.user)?.username
  if (typeof fromUser === 'string') return fromUser
  const fromOwner = asObject(obj.owner)?.username
  return typeof fromOwner === 'string' ? fromOwner : undefined
}

/**
 * Depth-first walk over arbitrary parsed JSON.
 * `visit` is called for every non-array object, parent before children.
 */
function walkJson(data: unknown, visit: (obj: JsonObject) => void): void {
  if (!data || typeof data !== 'object') return
  if (Array.isArray(data)) {
    for (const item of data) walkJson(item, visit)
    return
  }
  const obj = data as JsonObject
  visit(obj)
  for (const value of Object.values(obj)) walkJson(value, visit)
}

/**
 * Reads author metadata from `obj.user` (or `obj.owner` when `user` is nullish).
 * The follower count comes from `follower_count`, else `edge_followed_by.count`;
 * only positive numbers are accepted. Missing data yields `{ verified: false, followers: 0 }`.
 */
function readUserMeta(obj: JsonObject): { verified: boolean; followers: number } {
  const user = asObject(obj.user ?? obj.owner)
  if (!user) return { verified: false, followers: 0 }

  const direct = readCount(user.follower_count) ?? 0
  const viaEdge = readCount(asObject(user.edge_followed_by)?.count) ?? 0
  return {
    verified: user.is_verified === true,
    followers: direct > 0 ? direct : viaEdge,
  }
}

/**
 * Returns the post text from `caption.text`, `text_post_app_info.text` or `text`,
 * in that order. Every candidate must be a string; non-string values are skipped
 * so downstream string operations cannot crash on malformed payloads.
 */
function getPostText(obj: JsonObject): string | undefined {
  const candidates = [asObject(obj.caption)?.text, asObject(obj.text_post_app_info)?.text, obj.text]
  return candidates.find((candidate): candidate is string => typeof candidate === 'string')
}

/**
 * Walks `data` and appends every object that carries post-like text to `posts`.
 * `posts` is mutated in place so the same array can be fed from several sources.
 */
function extractPostsFromJson(data: unknown, posts: ThreadsPost[]): void {
  walkJson(data, (obj) => {
    const text = getPostText(obj)
    if (!text || text.length < MIN_TEXT_LENGTH) return

    const id = readId(obj)
    const username = readUsername(obj)
    const meta = readUserMeta(obj)

    posts.push({
      externalId: id,
      text,
      // NOTE(review): the id may come from `pk`/`id` rather than `code`; confirm
      // those values form a valid post URL.
      permalink: id && username ? `https://www.threads.com/@${username}/post/${id}` : undefined,
      authorName: username,
      authorVerified: meta.verified || undefined,
      followerCount: meta.followers > 0 ? meta.followers : undefined,
      likeCount: readCount(obj.like_count),
      replyCount:
        readCount(asObject(obj.text_post_app_info)?.direct_reply_count) ?? readCount(obj.reply_count),
    })
  })
}

/**
 * Fallback extraction: parses the JSON `<script data-sjs>` blobs embedded in the
 * page that mention `thread_items`. Scripts that fail to read or parse are skipped.
 */
async function extractFromPageScripts(page: Page): Promise<ThreadsPost[]> {
  const posts: ThreadsPost[] = []
  const scripts = await page.locator('script[type="application/json"][data-sjs]').all()
  for (const script of scripts) {
    try {
      const raw = await script.textContent()
      if (!raw || !raw.includes('thread_items')) continue
      extractPostsFromJson(JSON.parse(raw) as unknown, posts)
    } catch {
      // Best effort: a malformed or detached script must not abort the search.
    }
  }
  return posts
}

/**
 * Subscribes to network responses on `page` and pushes posts found in JSON
 * responses into `collected`. Only URLs containing "graphql", "threads" or
 * "instagram" are inspected.
 */
function attachCollector(page: Page, collected: ThreadsPost[]): void {
  page.on('response', async (response) => {
    const url = response.url()
    if (!url.includes('graphql') && !url.includes('threads') && !url.includes('instagram')) return
    try {
      const contentType = response.headers()['content-type'] ?? ''
      if (!contentType.includes('json')) return
      const json: unknown = await response.json()
      extractPostsFromJson(json, collected)
    } catch {
      // Best effort: the body may be unavailable or not valid JSON.
    }
  })
}

/**
 * Last-resort extraction from the rendered DOM: inspects up to MAX_DOM_LINKS
 * post links and reads the first `dir="auto"` text node near each one.
 */
async function scrapeSearchDom(page: Page): Promise<ThreadsPost[]> {
  const posts: ThreadsPost[] = []
  const seen = new Set<string>()
  const links = page.locator('a[href*="/post/"]')
  const total = Math.min(await links.count(), MAX_DOM_LINKS)

  for (let i = 0; i < total; i++) {
    const link = links.nth(i)
    try {
      const href = await link.getAttribute('href', { timeout: DOM_READ_TIMEOUT_MS })
      if (!href || seen.has(href)) continue
      seen.add(href)

      const permalink = href.startsWith('http') ? href : `https://www.threads.com${href}`
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]")
      const scope =
        (await container.count()) > 0
          ? container
          : link.locator('xpath=ancestor::div[position()<=6]').first()
      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: DOM_READ_TIMEOUT_MS })
        .catch(() => '')

      // Trim before the length check so whitespace padding cannot satisfy the minimum.
      const text = rawText.trim()
      if (text.length < MIN_TEXT_LENGTH) continue

      posts.push({
        text,
        permalink,
        authorName: href.match(AUTHOR_FROM_HREF)?.[1],
        externalId: href.match(POST_ID_FROM_HREF)?.[1],
      })
    } catch {
      // Best effort: skip links that disappear or time out.
    }
  }
  return posts
}

/** Waits a random 1.2–3.0 s after a navigation. */
async function humanLandingPause(page: Page): Promise<void> {
  await page.waitForTimeout(1200 + Math.floor(Math.random() * 1800))
}

/** Scrolls the page 2–4 times by a random distance, pausing 0.8–2.0 s after each pass. */
async function humanScrollPage(page: Page): Promise<void> {
  const passes = 2 + Math.floor(Math.random() * 3)
  for (let i = 0; i < passes; i++) {
    await page.mouse.wheel(0, 500 + Math.floor(Math.random() * 700))
    await page.waitForTimeout(800 + Math.floor(Math.random() * 1200))
  }
}

/**
 * Removes duplicates and returns at most `limit` posts, preserving input order.
 *
 * Posts are keyed by `externalId`, then `permalink`, then the first 120 characters
 * of text. In addition, an entry without a permalink is dropped when its text was
 * already emitted: the JSON walk visits a post and then its nested `caption`
 * object, which matches the `text` fallback and would otherwise appear as a
 * second, poorer copy of the same post.
 *
 * @returns an empty array when `limit` is not a positive number.
 */
function dedupePosts(posts: ThreadsPost[], limit: number): KeywordSearchPost[] {
  const out: KeywordSearchPost[] = []
  if (!(limit > 0)) return out

  const seenKeys = new Set<string>()
  const seenTexts = new Set<string>()
  for (const post of posts) {
    const key = post.externalId ?? post.permalink ?? post.text.slice(0, 120)
    if (seenKeys.has(key)) continue

    const normalizedText = post.text.trim()
    if (!post.permalink && seenTexts.has(normalizedText)) continue

    seenKeys.add(key)
    seenTexts.add(normalizedText)
    out.push({
      text: post.text,
      permalink: post.permalink,
      externalId: post.externalId,
      authorName: post.authorName,
      authorVerified: post.authorVerified,
      followerCount: post.followerCount,
      likeCount: post.likeCount,
      replyCount: post.replyCount,
    })
    if (out.length >= limit) break
  }
  return out
}

/**
 * Launches Chromium and opens a context restored from a serialized storage state.
 *
 * @param storageState JSON text of a Playwright storage state.
 * @returns the context plus a `close` function that closes the context and the browser.
 * @throws Error when `storageState` is not valid JSON, or when the browser or
 *   context cannot be created (the browser is closed before rethrowing).
 */
async function createBrowserContext(
  storageState: string,
): Promise<{ context: BrowserContext; close: () => Promise<void> }> {
  // Validate the input before paying for a browser launch.
  let parsedState: unknown
  try {
    parsedState = JSON.parse(storageState)
  } catch {
    throw new Error('瀏覽器 session 資料損毀,請到連線設定重新同步 Chrome extension')
  }

  const browser = await chromium.launch({
    headless: process.env.PLAYWRIGHT_HEADLESS !== 'false',
    args: BROWSER_ARGS,
  })

  try {
    const context = await browser.newContext({
      storageState: parsedState as BrowserContextOptions['storageState'],
      userAgent: USER_AGENT,
      viewport: { width: 1280, height: 900 },
      locale: 'zh-TW',
      timezoneId: 'Asia/Taipei',
    })
    await context.addInitScript(STEALTH_INIT_SCRIPT)

    return {
      context,
      close: async () => {
        try {
          await context.close()
        } finally {
          // Always release the browser process, even if closing the context failed.
          await browser.close()
        }
      },
    }
  } catch (error) {
    // Do not leak the browser process when context setup fails.
    await browser.close().catch(() => undefined)
    throw error
  }
}

/**
 * Playwright keyword search on threads.com (dev_mode crawler).
 *
 * Opens the home page to verify the session, navigates to the search page, and
 * collects posts from JSON network responses. When fewer than three posts were
 * collected it falls back to embedded page scripts and then to DOM scraping.
 *
 * @param storageState JSON text of a Playwright storage state holding the login session.
 * @param query Keyword to search for; surrounding whitespace is ignored.
 * @param limit Maximum number of posts to return (default 12).
 * @returns de-duplicated posts; an empty array for a blank query or a "not found" page.
 * @throws Error when the storage state is blank or corrupt, or the session is no longer logged in.
 */
export async function searchKeywords(
  storageState: string,
  query: string,
  limit = 12,
): Promise<KeywordSearchPost[]> {
  const keyword = query.trim()
  if (!keyword) return []
  if (!storageState.trim()) {
    throw new Error('找不到 Chrome session,請先到連線頁同步 Threads 登入態')
  }

  const { context, close } = await createBrowserContext(storageState)
  try {
    const page = await context.newPage()
    const collected: ThreadsPost[] = []
    attachCollector(page, collected)

    await page.goto('https://www.threads.com/', { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)

    // NOTE(review): matching '登入' anywhere in the body may also trigger on feed
    // content that happens to contain the word — confirm this heuristic is acceptable.
    const homeText = await page.locator('body').innerText().catch(() => '')
    if (page.url().includes('/login') || homeText.includes('登入')) {
      throw new Error('Session 已失效,請到連線頁重新同步 Chrome Session')
    }

    const searchUrl = `https://www.threads.com/search?q=${encodeURIComponent(keyword)}&serp_type=default`
    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)

    // Best effort, like the home-page read above: a failed body read should not
    // discard posts already collected from network responses.
    const bodyText = await page.locator('body').innerText().catch(() => '')
    if (bodyText.includes('走丟') || bodyText.includes('頁面不存在')) {
      return []
    }

    await page.waitForSelector('a[href*="/post/"]', { timeout: 12_000 }).catch(() => undefined)
    await humanScrollPage(page)

    if (collected.length < MIN_COLLECTED_BEFORE_FALLBACK) {
      collected.push(...(await extractFromPageScripts(page)))
    }
    if (collected.length < MIN_COLLECTED_BEFORE_FALLBACK) {
      collected.push(...(await scrapeSearchDom(page)))
    }

    return dedupePosts(collected, limit)
  } finally {
    await close()
  }
}