275 lines
8.8 KiB
TypeScript
275 lines
8.8 KiB
TypeScript
import { chromium, type BrowserContext, type BrowserContextOptions, type Page } from 'playwright'
|
||
|
||
/**
 * One post produced by a keyword search.
 *
 * Only `text` is guaranteed; every other field is filled in when the source
 * the post was read from (JSON payload or rendered DOM) exposes it.
 */
export type KeywordSearchPost = {
  /** Body text of the post. */
  text: string
  /** Absolute URL of the post, when one could be built or read from a link. */
  permalink?: string
  /** Identifier of the post as found in the source (shortcode, pk or id). */
  externalId?: string
  /** Username of the author (without the leading `@`). */
  authorName?: string
  /** Whether the author is flagged as verified by the source payload. */
  authorVerified?: boolean
  /** Follower count of the author. */
  followerCount?: number
  /** Number of likes on the post. */
  likeCount?: number
  /** Number of replies to the post. */
  replyCount?: number
}
|
||
|
||
/** User-Agent string set on the browser context (desktop Chrome 131 on macOS). */
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
|
||
|
||
/** Extra command-line switches handed to Chromium at launch. */
const BROWSER_ARGS: string[] = [
  // Turns off the Blink feature named "AutomationControlled".
  '--disable-blink-features=AutomationControlled',
  '--no-first-run',
  '--no-default-browser-check',
]
|
||
|
||
/**
 * Init script registered on the browser context with `addInitScript`.
 *
 * Redefines `navigator.webdriver` so that reading it yields `false`.
 * The function only touches the page's own `navigator` global and closes
 * over nothing from this module.
 */
const STEALTH_INIT_SCRIPT = (): void => {
  Object.defineProperty(navigator, 'webdriver', { get: () => false })
}
|
||
|
||
/** Internal name for a scraped post; identical in shape to the exported `KeywordSearchPost`. */
type ThreadsPost = KeywordSearchPost
|
||
|
||
/**
 * Depth-first, pre-order walk over an arbitrary JSON-like value.
 *
 * `visit` is called once for every plain (non-array) object encountered,
 * parent before children. Arrays are traversed but never passed to `visit`;
 * primitives, `null` and `undefined` are ignored.
 *
 * An object that is already on the current ancestor path is not entered
 * again, so a self-referencing structure terminates instead of overflowing
 * the stack. Objects that are merely shared between sibling branches are
 * still visited once per occurrence.
 *
 * @param data  Value to traverse.
 * @param visit Callback invoked for each plain object.
 */
function walkJson(data: unknown, visit: (obj: Record<string, unknown>) => void): void {
  // Objects currently being descended into (the path from the root).
  const ancestors = new Set<object>()

  const descend = (node: unknown): void => {
    if (node === null || typeof node !== 'object') return
    if (ancestors.has(node)) return // cycle: already on the path above us

    ancestors.add(node)
    if (Array.isArray(node)) {
      for (const item of node) descend(item)
    } else {
      const record = node as Record<string, unknown>
      visit(record)
      for (const value of Object.values(record)) descend(value)
    }
    ancestors.delete(node)
  }

  descend(data)
}
|
||
|
||
/**
 * Reads author metadata from a post-like object.
 *
 * The author record is taken from `obj.user`, falling back to `obj.owner`.
 * A value that is not a plain object (string id, number, array, ...) is
 * skipped, so a malformed `user` no longer hides a usable `owner`.
 *
 * @param obj Post-like object that may carry a `user` or `owner` record.
 * @returns `verified` is true only when `is_verified === true`;
 *          `followers` is the first positive number found in
 *          `follower_count` or `edge_followed_by.count`, otherwise 0.
 */
function readUserMeta(obj: Record<string, unknown>): { verified: boolean; followers: number } {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value)
  const positive = (value: unknown): number | undefined =>
    typeof value === 'number' && value > 0 ? value : undefined

  const user = [obj.user, obj.owner].find(isRecord)
  if (!user) return { verified: false, followers: 0 }

  const edge = user.edge_followed_by
  const followers =
    positive(user.follower_count) ?? positive(isRecord(edge) ? edge.count : undefined) ?? 0

  return { verified: user.is_verified === true, followers }
}
|
||
|
||
/**
 * Picks the text of a post-like object.
 *
 * Candidates are checked in order: `caption.text`,
 * `text_post_app_info.text`, then `text` on the object itself. The first
 * candidate that is actually a string wins; values of any other type are
 * skipped rather than returned under a `string` type.
 *
 * @param obj Post-like object.
 * @returns The text, or `undefined` when no candidate is a string.
 */
function getPostText(obj: Record<string, unknown>): string | undefined {
  // Reads `.text` off a nested holder, tolerating null / primitive holders.
  const nestedText = (holder: unknown): unknown =>
    typeof holder === 'object' && holder !== null ? (holder as { text?: unknown }).text : undefined

  const candidates: unknown[] = [
    nestedText(obj.caption),
    nestedText(obj.text_post_app_info),
    obj.text,
  ]
  return candidates.find((value): value is string => typeof value === 'string')
}
|
||
|
||
/**
 * Walks a JSON payload and appends every post-like object found to `posts`.
 *
 * An object counts as a post when `getPostText` yields at least 5
 * characters for it. Identity, author and engagement fields are copied
 * when present and of the expected primitive type.
 *
 * @param data  Parsed JSON of unknown shape.
 * @param posts Output array; matches are pushed onto it (mutated in place).
 */
function extractPostsFromJson(data: unknown, posts: ThreadsPost[]): void {
  const isObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null
  const asString = (value: unknown): string | undefined =>
    typeof value === 'string' && value.length > 0 ? value : undefined
  const asNumber = (value: unknown): number | undefined =>
    typeof value === 'number' && Number.isFinite(value) ? value : undefined

  // `caption` / `text_post_app_info` objects whose parent was already emitted.
  // The walker reaches them after the parent (it visits parents first); a
  // caption `{ text }` would otherwise be emitted as a second, id-less copy
  // of the same post.
  const consumed = new WeakSet<object>()

  walkJson(data, (obj) => {
    if (consumed.has(obj)) return

    const text = getPostText(obj)
    if (typeof text !== 'string' || text.length < 5) return

    const caption = obj.caption
    const appInfo = obj.text_post_app_info
    if (isObject(caption)) consumed.add(caption)
    if (isObject(appInfo)) consumed.add(appInfo)

    // externalId accepts any of code / pk / id. The permalink is built from
    // `code` only, because a pk/id fallback does not form a working
    // `/post/<code>` URL.
    const shortcode = asString(obj.code)
    const rawId = [obj.code, obj.pk, obj.id].find(
      (value) => asString(value) !== undefined || asNumber(value) !== undefined,
    )
    const user = obj.user
    const owner = obj.owner
    const username =
      (isObject(user) ? asString(user.username) : undefined) ??
      (isObject(owner) ? asString(owner.username) : undefined)
    const meta = readUserMeta(obj)

    posts.push({
      externalId: rawId === undefined ? undefined : String(rawId),
      text,
      permalink:
        shortcode && username ? `https://www.threads.com/@${username}/post/${shortcode}` : undefined,
      authorName: username,
      authorVerified: meta.verified || undefined,
      followerCount: meta.followers > 0 ? meta.followers : undefined,
      likeCount: asNumber(obj.like_count),
      replyCount:
        (isObject(appInfo) ? asNumber(appInfo.direct_reply_count) : undefined) ??
        asNumber(obj.reply_count),
    })
  })
}
|
||
|
||
/**
 * Collects posts from the JSON blobs embedded in the page as
 * `<script type="application/json" data-sjs>` elements.
 *
 * Only scripts whose text contains `thread_items` are parsed. A script that
 * cannot be read or parsed is skipped; the remaining scripts are still
 * processed.
 *
 * @param page Page to inspect.
 * @returns Posts found across all matching scripts (not de-duplicated).
 */
async function extractFromPageScripts(page: Page): Promise<ThreadsPost[]> {
  const found: ThreadsPost[] = []
  const scriptTags = await page.locator('script[type="application/json"][data-sjs]').all()

  for (const tag of scriptTags) {
    try {
      const payload = await tag.textContent()
      // Cheap substring filter before paying for JSON.parse.
      if (payload?.includes('thread_items')) {
        extractPostsFromJson(JSON.parse(payload), found)
      }
    } catch {
      // Best effort: an unreadable or malformed script is skipped.
    }
  }

  return found
}
|
||
|
||
/**
 * Subscribes to the page's `response` events and appends posts found in
 * JSON responses to `collected`.
 *
 * A response is inspected only when its URL contains `graphql`, `threads`
 * or `instagram` and its `content-type` header contains `json`. Failures
 * while reading or parsing a response are ignored.
 *
 * @param page      Page whose network responses are observed.
 * @param collected Output array, mutated asynchronously as responses arrive.
 */
function attachCollector(page: Page, collected: ThreadsPost[]): void {
  const urlHints = ['graphql', 'threads', 'instagram']

  page.on('response', async (response) => {
    const url = response.url()
    if (!urlHints.some((hint) => url.includes(hint))) return

    try {
      const contentType = response.headers()['content-type'] ?? ''
      if (!contentType.includes('json')) return
      extractPostsFromJson(await response.json(), collected)
    } catch {
      // Best effort: a body that is gone or not valid JSON is ignored.
    }
  })
}
|
||
|
||
/**
 * Fallback scraper that reads posts straight from the rendered DOM.
 *
 * Looks at up to 40 `a[href*="/post/"]` links (each distinct href once),
 * finds the enclosing post container and takes the first `dir="auto"`
 * element's text as the post body. Author and post id are parsed out of
 * the href. Links that fail to resolve within the per-step timeout are
 * skipped.
 *
 * @param page Search results page.
 * @returns Posts with `text`, `permalink`, `authorName` and `externalId`.
 */
async function scrapeSearchDom(page: Page): Promise<ThreadsPost[]> {
  const maxLinks = 40
  const stepTimeoutMs = 1200

  const posts: ThreadsPost[] = []
  const seenHrefs = new Set<string>()
  const links = page.locator('a[href*="/post/"]')
  const total = Math.min(await links.count(), maxLinks)

  for (let i = 0; i < total; i++) {
    const link = links.nth(i)
    try {
      const href = await link.getAttribute('href', { timeout: stepTimeoutMs })
      if (!href || seenHrefs.has(href)) continue
      seenHrefs.add(href)

      // Prefer the nearest pressable container; otherwise fall back to one
      // of the closest six <div> ancestors.
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]")
      const scope =
        (await container.count()) > 0 ? container : link.locator('xpath=ancestor::div[position()<=6]').first()

      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: stepTimeoutMs })
        .catch(() => '')

      // Trim before the length check so whitespace padding cannot carry an
      // empty or too-short text past the 5-character minimum.
      const text = rawText.trim()
      if (text.length < 5) continue

      posts.push({
        text,
        permalink: href.startsWith('http') ? href : `https://www.threads.com${href}`,
        authorName: href.match(/@([^/]+)\/post/)?.[1],
        externalId: href.match(/\/post\/([^/?]+)/)?.[1],
      })
    } catch {
      // Best effort: a link that detached or timed out is skipped.
    }
  }

  return posts
}
|
||
|
||
/**
 * Waits a random 1200–2999 ms after a navigation.
 *
 * @param page Page on which the wait is scheduled.
 */
async function humanLandingPause(page: Page): Promise<void> {
  const jitterMs = Math.floor(Math.random() * 1800)
  await page.waitForTimeout(1200 + jitterMs)
}
|
||
|
||
/**
 * Scrolls the page down in 2–4 passes. Each pass turns the mouse wheel by a
 * random 500–1199 px and then waits a random 800–1999 ms.
 *
 * @param page Page to scroll.
 */
async function humanScrollPage(page: Page): Promise<void> {
  const jitter = (base: number, spread: number): number => base + Math.floor(Math.random() * spread)

  let remaining = jitter(2, 3)
  while (remaining-- > 0) {
    await page.mouse.wheel(0, jitter(500, 700))
    await page.waitForTimeout(jitter(800, 1200))
  }
}
|
||
|
||
/**
 * Removes duplicate posts (first occurrence wins) and caps the result.
 *
 * Two posts are duplicates when they share the same key: `externalId`,
 * else `permalink`, else the first 120 characters of `text`.
 *
 * @param posts Posts in priority order.
 * @param limit Maximum number of posts to return; a value of 0 or less
 *              yields an empty array.
 * @returns Fresh objects carrying only the `KeywordSearchPost` fields.
 */
function dedupePosts(posts: ThreadsPost[], limit: number): KeywordSearchPost[] {
  const unique: KeywordSearchPost[] = []
  // The cap is checked after each push, so a non-positive limit has to be
  // rejected up front or one post would still be returned.
  if (limit <= 0) return unique

  const seenKeys = new Set<string>()
  for (const post of posts) {
    const key = post.externalId ?? post.permalink ?? post.text.slice(0, 120)
    if (seenKeys.has(key)) continue
    seenKeys.add(key)

    const {
      text,
      permalink,
      externalId,
      authorName,
      authorVerified,
      followerCount,
      likeCount,
      replyCount,
    } = post
    unique.push({
      text,
      permalink,
      externalId,
      authorName,
      authorVerified,
      followerCount,
      likeCount,
      replyCount,
    })

    if (unique.length >= limit) break
  }
  return unique
}
|
||
|
||
/**
 * Launches Chromium and opens a browser context restored from a serialized
 * session.
 *
 * The session JSON is validated before the browser is launched, so a
 * corrupt session does not cost a browser start. If context creation
 * fails, the browser is closed before the error is rethrown.
 *
 * @param storageState JSON text of a Playwright storage state object.
 * @returns The context plus a `close` function that closes the context and
 *          then the browser (the browser is closed even if closing the
 *          context throws).
 * @throws Error when `storageState` is not valid JSON or does not decode to
 *         a plain object.
 */
async function createBrowserContext(storageState: string): Promise<{ context: BrowserContext; close: () => Promise<void> }> {
  const corruptSession = (): Error =>
    new Error('瀏覽器 session 資料損毀,請到連線設定重新同步 Chrome extension')

  let parsedState: unknown
  try {
    parsedState = JSON.parse(storageState)
  } catch {
    throw corruptSession()
  }
  // Playwright's `storageState` option also accepts a string, which it
  // treats as a file path. Session data that decodes to a JSON string
  // (or null / array) must not be passed through.
  if (parsedState === null || typeof parsedState !== 'object' || Array.isArray(parsedState)) {
    throw corruptSession()
  }

  const browser = await chromium.launch({
    headless: process.env.PLAYWRIGHT_HEADLESS !== 'false',
    args: BROWSER_ARGS,
  })

  try {
    const context = await browser.newContext({
      storageState: parsedState as BrowserContextOptions['storageState'],
      userAgent: USER_AGENT,
      viewport: { width: 1280, height: 900 },
      locale: 'zh-TW',
      timezoneId: 'Asia/Taipei',
    })
    await context.addInitScript(STEALTH_INIT_SCRIPT)

    return {
      context,
      close: async () => {
        try {
          await context.close()
        } finally {
          await browser.close()
        }
      },
    }
  } catch (error) {
    // Do not leak the browser process when the context cannot be set up.
    await browser.close().catch(() => undefined)
    throw error
  }
}
|
||
|
||
/**
 * Playwright keyword search on threads.com (dev_mode crawler).
 *
 * Opens the home page to confirm the session is still logged in, then loads
 * the search page for `query`, scrolls it, and gathers posts from (in
 * order) JSON network responses, embedded page scripts, and the rendered
 * DOM. Each later source is consulted only while fewer than 3 posts have
 * been collected.
 *
 * @param storageState JSON text of the saved browser session.
 * @param query        Keyword to search for; surrounding whitespace is ignored.
 * @param limit        Maximum number of posts to return (default 12).
 * @returns De-duplicated posts, at most `limit`. Empty when the query is
 *          blank, `limit` is 0 or less, or the search page reports that
 *          the page does not exist.
 * @throws Error when no session is supplied, the session data is corrupt,
 *         or the session is no longer logged in.
 */
export async function searchKeywords(
  storageState: string,
  query: string,
  limit = 12,
): Promise<KeywordSearchPost[]> {
  const keyword = query.trim()
  if (!keyword) return []
  if (!storageState.trim()) {
    throw new Error('找不到 Chrome session,請先到連線頁同步 Threads 登入態')
  }
  // Nothing can be returned; skip launching a browser.
  if (limit <= 0) return []

  const { context, close } = await createBrowserContext(storageState)
  try {
    const page = await context.newPage()

    await page.goto('https://www.threads.com/', { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)
    const homeText = await page.locator('body').innerText().catch(() => '')
    if (page.url().includes('/login') || homeText.includes('登入')) {
      throw new Error('Session 已失效,請到連線頁重新同步 Chrome Session')
    }

    // Start collecting only now: responses loaded for the home feed are not
    // results for `keyword` and must not end up in the output.
    const collected: ThreadsPost[] = []
    attachCollector(page, collected)

    const searchUrl = `https://www.threads.com/search?q=${encodeURIComponent(keyword)}&serp_type=default`
    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)

    const bodyText = await page.locator('body').innerText()
    if (bodyText.includes('走丟') || bodyText.includes('頁面不存在')) {
      return []
    }

    // Missing result links are not fatal; the fallbacks below still run.
    await page.waitForSelector('a[href*="/post/"]', { timeout: 12_000 }).catch(() => undefined)
    await humanScrollPage(page)

    if (collected.length < 3) {
      collected.push(...(await extractFromPageScripts(page)))
    }
    if (collected.length < 3) {
      collected.push(...(await scrapeSearchDom(page)))
    }

    return dedupePosts(collected, limit)
  } finally {
    await close()
  }
}