thread-master/worker/threads-keyword-search.ts

275 lines
8.8 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { chromium, type BrowserContext, type BrowserContextOptions, type Page } from 'playwright'
/** Normalised shape of a single Threads post returned by the keyword search. */
export type KeywordSearchPost = {
// Body text of the post.
text: string
// URL of the post, when one could be determined.
permalink?: string
// Identifier of the post on Threads, when one was found in the scraped data.
externalId?: string
// Username of the author, when available.
authorName?: string
// Present (true) only when the author is flagged as verified in the scraped data.
authorVerified?: boolean
// Author's follower count, when a positive value was found.
followerCount?: number
// Number of likes reported for the post, when available.
likeCount?: number
// Number of replies reported for the post, when available.
replyCount?: number
}
// User-agent string given to the browser context (desktop Chrome 131 on macOS).
const USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
// Extra command-line flags passed to `chromium.launch`.
const BROWSER_ARGS = [
'--disable-blink-features=AutomationControlled',
'--no-first-run',
'--no-default-browser-check',
]
// Init script installed on the browser context: makes `navigator.webdriver` report `false`.
const STEALTH_INIT_SCRIPT = () => {
Object.defineProperty(navigator, 'webdriver', { get: () => false })
}
// Internal alias: posts gathered while scraping share the public `KeywordSearchPost` shape.
type ThreadsPost = KeywordSearchPost
/**
 * Depth-first, parent-before-children traversal of an arbitrary parsed-JSON value.
 *
 * Every non-array object that is reached is passed to `visit` before its own
 * property values are explored. Arrays are descended into but are never passed
 * to `visit` themselves. Primitives and `null` are ignored.
 *
 * @param data  Value to traverse.
 * @param visit Callback invoked once for each non-array object encountered.
 */
function walkJson(data: unknown, visit: (obj: Record<string, unknown>) => void): void {
  // Only objects (including arrays) can contain anything worth visiting.
  if (data === null || typeof data !== 'object') return

  if (Array.isArray(data)) {
    for (let index = 0; index < data.length; index++) {
      walkJson(data[index], visit)
    }
    return
  }

  const record = data as Record<string, unknown>
  visit(record)
  // Non-object children are filtered out by the guard at the top of the recursive call.
  Object.values(record).forEach((child) => walkJson(child, visit))
}
/**
 * Reads author metadata from a post-like JSON object.
 *
 * The author record is taken from `obj.user`, falling back to `obj.owner` when
 * `user` is null or undefined. The follower count comes from `follower_count`
 * when that is a positive number, otherwise from `edge_followed_by.count` when
 * that is a positive number, otherwise it is 0.
 *
 * @param obj Object that may carry a `user` or `owner` record.
 * @returns `verified` is true only when `is_verified === true`; `followers` is 0
 *          when no positive count is available or no author record exists.
 */
function readUserMeta(obj: Record<string, unknown>): { verified: boolean; followers: number } {
  const author = obj.user ?? obj.owner
  if (!author || typeof author !== 'object') {
    return { verified: false, followers: 0 }
  }

  const profile = author as Record<string, unknown>
  const isPositiveNumber = (value: unknown): value is number =>
    typeof value === 'number' && value > 0

  const directCount = profile.follower_count
  const edgeCount = (profile.edge_followed_by as { count?: unknown } | null | undefined)?.count

  const followers = isPositiveNumber(directCount)
    ? directCount
    : isPositiveNumber(edgeCount)
      ? edgeCount
      : 0

  return { verified: profile.is_verified === true, followers }
}
/**
 * Picks the post text out of a post-like JSON object.
 *
 * Candidates are tried in order: `caption.text`, `text_post_app_info.text`,
 * then `text`. The first candidate that is actually a string wins; values of
 * any other type are skipped instead of being returned, so callers can rely on
 * the declared `string | undefined` result at runtime.
 *
 * @param obj Object to inspect.
 * @returns The first string candidate (possibly empty), or `undefined` when none exists.
 */
function getPostText(obj: Record<string, unknown>): string | undefined {
  const candidates: unknown[] = [
    (obj.caption as { text?: unknown } | null | undefined)?.text,
    (obj.text_post_app_info as { text?: unknown } | null | undefined)?.text,
    obj.text,
  ]
  return candidates.find((candidate): candidate is string => typeof candidate === 'string')
}
/**
 * Walks a parsed JSON payload and appends every post-like object to `posts`.
 *
 * An object counts as a post when `getPostText` yields a string of at least
 * five characters. Scraped fields are validated at runtime before being
 * copied, because the payload is untyped: identifiers must be non-empty
 * strings or finite numbers, counts must be finite numbers, and the username
 * must be a non-empty string. Anything else is reported as `undefined`.
 *
 * @param data  Parsed JSON of unknown shape.
 * @param posts Output array; matching posts are pushed onto it in traversal order.
 */
function extractPostsFromJson(data: unknown, posts: ThreadsPost[]): void {
  // Accepts only primitives so that objects never become "[object Object]".
  const asIdentifier = (value: unknown): string | undefined => {
    if (typeof value === 'string') return value.length > 0 ? value : undefined
    if (typeof value === 'number' && Number.isFinite(value)) return String(value)
    return undefined
  }
  const asCount = (value: unknown): number | undefined =>
    typeof value === 'number' && Number.isFinite(value) ? value : undefined
  const asUsername = (value: unknown): string | undefined =>
    typeof value === 'string' && value.length > 0 ? value : undefined

  walkJson(data, (obj) => {
    const text = getPostText(obj)
    if (typeof text !== 'string' || text.length < 5) return

    const shortcode = asIdentifier(obj.code)
    const externalId = shortcode ?? asIdentifier(obj.pk) ?? asIdentifier(obj.id)
    const username =
      asUsername((obj.user as { username?: unknown } | null | undefined)?.username) ??
      asUsername((obj.owner as { username?: unknown } | null | undefined)?.username)
    const meta = readUserMeta(obj)
    const appInfo = obj.text_post_app_info as { direct_reply_count?: unknown } | null | undefined

    posts.push({
      externalId,
      text,
      // Only `code` is used for the URL: the `pk`/`id` fallbacks identify the
      // post but are not valid `/post/<shortcode>` path segments.
      permalink:
        shortcode && username ? `https://www.threads.com/@${username}/post/${shortcode}` : undefined,
      authorName: username,
      authorVerified: meta.verified || undefined,
      followerCount: meta.followers > 0 ? meta.followers : undefined,
      likeCount: asCount(obj.like_count),
      replyCount: asCount(appInfo?.direct_reply_count) ?? asCount(obj.reply_count),
    })
  })
}
/**
 * Collects posts from the JSON payloads embedded in the page's
 * `<script type="application/json" data-sjs>` elements.
 *
 * Only scripts whose text contains `thread_items` are parsed. A script that
 * cannot be read or parsed is skipped without aborting the others.
 *
 * @param page Page whose embedded scripts are inspected.
 * @returns Posts extracted from all readable scripts, in script order.
 */
async function extractFromPageScripts(page: Page): Promise<ThreadsPost[]> {
  const found: ThreadsPost[] = []
  const scriptTags = await page.locator('script[type="application/json"][data-sjs]').all()

  for (const tag of scriptTags) {
    try {
      const payload = await tag.textContent()
      if (payload?.includes('thread_items')) {
        extractPostsFromJson(JSON.parse(payload), found)
      }
    } catch {
      // Best effort: an unreadable or malformed script is simply skipped.
    }
  }

  return found
}
/**
 * Registers a `response` listener on the page that feeds JSON responses into
 * `extractPostsFromJson`, accumulating any posts found in `collected`.
 *
 * A response is considered only when its URL contains `graphql`, `threads` or
 * `instagram` and its `content-type` header mentions `json`. Failures while
 * reading or parsing a response are ignored.
 *
 * @param page      Page to observe.
 * @param collected Array that receives posts as responses arrive.
 */
function attachCollector(page: Page, collected: ThreadsPost[]) {
  const urlMarkers = ['graphql', 'threads', 'instagram']

  page.on('response', async (response) => {
    const requestUrl = response.url()
    const isRelevant = urlMarkers.some((marker) => requestUrl.includes(marker))
    if (!isRelevant) return

    try {
      const mimeType = response.headers()['content-type'] ?? ''
      if (mimeType.includes('json')) {
        extractPostsFromJson(await response.json(), collected)
      }
    } catch {
      // Best effort: responses that cannot be read as JSON are ignored.
    }
  })
}
/**
 * Last-resort scraper: reads posts straight from the rendered search results.
 *
 * Looks at up to 40 anchors whose `href` contains `/post/`. For each distinct
 * href it resolves the surrounding container (the nearest ancestor marked
 * `data-pressable-container="true"`, otherwise one of the closest `div`
 * ancestors) and takes the first `dir="auto"` text element inside it as the
 * post text. The text is trimmed before the minimum-length check so that
 * whitespace-padded or blank snippets are not reported as posts.
 *
 * @param page Page currently showing the search results.
 * @returns Posts with text, permalink, author name and external id where derivable.
 */
async function scrapeSearchDom(page: Page): Promise<ThreadsPost[]> {
  const maxLinks = 40
  const lookupTimeoutMs = 1200
  const minTextLength = 5
  const authorPattern = /@([^/]+)\/post/
  const postIdPattern = /\/post\/([^/?]+)/

  const posts: ThreadsPost[] = []
  const seen = new Set<string>()
  const links = page.locator('a[href*="/post/"]')
  const total = Math.min(await links.count(), maxLinks)

  for (let i = 0; i < total; i++) {
    const link = links.nth(i)
    try {
      const href = await link.getAttribute('href', { timeout: lookupTimeoutMs })
      if (!href || seen.has(href)) continue
      seen.add(href)

      const permalink = href.startsWith('http') ? href : `https://www.threads.com${href}`
      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]")
      const scope =
        (await container.count()) > 0 ? container : link.locator('xpath=ancestor::div[position()<=6]').first()

      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: lookupTimeoutMs })
        .catch(() => '')
      // Measure the text that will actually be stored, not the padded original.
      const text = rawText.trim()
      if (text.length < minTextLength) continue

      posts.push({
        text,
        permalink,
        authorName: href.match(authorPattern)?.[1],
        externalId: href.match(postIdPattern)?.[1],
      })
    } catch {
      // Best effort: a link that cannot be inspected is skipped.
    }
  }

  return posts
}
/**
 * Waits a randomised 1200–2999 ms on the page, used after navigations.
 *
 * @param page Page on which to wait.
 */
async function humanLandingPause(page: Page) {
  const jitterMs = Math.floor(Math.random() * 1800)
  await page.waitForTimeout(1200 + jitterMs)
}
/**
 * Scrolls the page with the mouse wheel in 2–4 passes, each moving
 * 500–1199 px down and followed by a randomised 800–1999 ms wait.
 *
 * @param page Page to scroll.
 */
async function humanScrollPage(page: Page) {
  let remaining = 2 + Math.floor(Math.random() * 3)
  while (remaining-- > 0) {
    const deltaY = 500 + Math.floor(Math.random() * 700)
    await page.mouse.wheel(0, deltaY)

    const restMs = 800 + Math.floor(Math.random() * 1200)
    await page.waitForTimeout(restMs)
  }
}
/**
 * Removes duplicate posts and caps the result at `limit` entries.
 *
 * Two posts are duplicates when they share a key, where the key is the first
 * available of: `externalId`, `permalink`, or the first 120 characters of the
 * text. The first occurrence wins and input order is preserved. Each returned
 * post is a fresh object carrying only the `KeywordSearchPost` fields.
 *
 * @param posts Candidate posts, possibly containing duplicates.
 * @param limit Maximum number of posts to return; zero or a negative value yields `[]`.
 * @returns Up to `limit` unique posts.
 */
function dedupePosts(posts: ThreadsPost[], limit: number): KeywordSearchPost[] {
  const out: KeywordSearchPost[] = []
  // Checked up front: the in-loop check only runs after a push, which would
  // otherwise let one post through for a non-positive limit.
  if (limit <= 0) return out

  const seen = new Set<string>()
  for (const post of posts) {
    const key = post.externalId ?? post.permalink ?? post.text.slice(0, 120)
    if (seen.has(key)) continue
    seen.add(key)
    out.push({
      text: post.text,
      permalink: post.permalink,
      externalId: post.externalId,
      authorName: post.authorName,
      authorVerified: post.authorVerified,
      followerCount: post.followerCount,
      likeCount: post.likeCount,
      replyCount: post.replyCount,
    })
    if (out.length >= limit) break
  }
  return out
}
/**
 * Launches Chromium and opens a browser context restored from a serialised
 * storage state.
 *
 * The storage state is parsed and validated before the browser is started, so
 * a corrupt session never costs a browser launch. Only a JSON object is
 * accepted: Playwright would interpret a string `storageState` as a file path.
 * If creating or preparing the context fails, the browser is closed before the
 * error is rethrown.
 *
 * @param storageState JSON text of the storage state to restore.
 * @returns The context plus a `close` function that shuts down the context and
 *          then the browser; the browser is closed even if closing the context fails.
 * @throws Error when `storageState` is not valid JSON or is not a JSON object;
 *         launch and context-creation errors are propagated unchanged.
 */
async function createBrowserContext(storageState: string): Promise<{ context: BrowserContext; close: () => Promise<void> }> {
  const corruptSessionMessage = '瀏覽器 session 資料損毀,請到連線設定重新同步 Chrome extension'

  let parsedState: unknown
  try {
    parsedState = JSON.parse(storageState)
  } catch {
    throw new Error(corruptSessionMessage)
  }
  if (parsedState === null || typeof parsedState !== 'object' || Array.isArray(parsedState)) {
    throw new Error(corruptSessionMessage)
  }

  const browser = await chromium.launch({
    headless: process.env.PLAYWRIGHT_HEADLESS !== 'false',
    args: BROWSER_ARGS,
  })

  try {
    const context = await browser.newContext({
      storageState: parsedState as BrowserContextOptions['storageState'],
      userAgent: USER_AGENT,
      viewport: { width: 1280, height: 900 },
      locale: 'zh-TW',
      timezoneId: 'Asia/Taipei',
    })
    await context.addInitScript(STEALTH_INIT_SCRIPT)

    return {
      context,
      close: async () => {
        try {
          await context.close()
        } finally {
          // Always release the browser process, even if the context failed to close.
          await browser.close()
        }
      },
    }
  } catch (error) {
    // Do not leave an orphaned browser behind when setup fails part-way.
    await browser.close().catch(() => undefined)
    throw error
  }
}
/**
 * Playwright keyword search on threads.com (dev_mode crawler).
 *
 * Flow: open the home page with the stored session, verify the session still
 * looks logged in, open the search page for the keyword, scroll, then gather
 * posts. Posts come primarily from JSON responses intercepted while the pages
 * load; when fewer than three were captured, the page's embedded JSON scripts
 * and finally the rendered DOM are used as fallbacks. The browser is always
 * closed before returning or throwing.
 *
 * @param storageState JSON text of the browser storage state holding the Threads login.
 * @param query        Search keyword; surrounding whitespace is ignored.
 * @param limit        Maximum number of posts to return (default 12).
 * @returns Up to `limit` unique posts; `[]` for a blank query, a non-positive
 *          limit, or when the search page reports that the page does not exist.
 * @throws Error when `storageState` is blank, when the session appears to be
 *         logged out, or when the browser/context cannot be created or navigation fails.
 */
export async function searchKeywords(
  storageState: string,
  query: string,
  limit = 12,
): Promise<KeywordSearchPost[]> {
  const keyword = query.trim()
  if (!keyword) return []
  if (!storageState.trim()) {
    throw new Error('找不到 Chrome session請先到連線頁同步 Threads 登入態')
  }
  // Nothing can be returned for a non-positive limit, so skip the browser launch.
  if (limit <= 0) return []

  const { context, close } = await createBrowserContext(storageState)
  try {
    const page = await context.newPage()
    const collected: ThreadsPost[] = []
    // Attach before the first navigation so no JSON response is missed.
    attachCollector(page, collected)

    await page.goto('https://www.threads.com/', { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)
    const homeText = await page.locator('body').innerText().catch(() => '')
    // NOTE(review): matching '登入' anywhere in the body presumably also fires when a
    // logged-in feed happens to contain that word — confirm against the real page.
    if (page.url().includes('/login') || homeText.includes('登入')) {
      throw new Error('Session 已失效,請到連線頁重新同步 Chrome Session')
    }

    const searchUrl = `https://www.threads.com/search?q=${encodeURIComponent(keyword)}&serp_type=default`
    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)
    // Best effort, like the home-page read: failing to read the body must not
    // discard posts already captured from network responses.
    const bodyText = await page.locator('body').innerText().catch(() => '')
    if (bodyText.includes('走丟') || bodyText.includes('頁面不存在')) {
      return []
    }

    // Results may legitimately be absent; the fallbacks below cope with that.
    await page.waitForSelector('a[href*="/post/"]', { timeout: 12_000 }).catch(() => undefined)
    await humanScrollPage(page)

    if (collected.length < 3) {
      collected.push(...(await extractFromPageScripts(page)))
    }
    if (collected.length < 3) {
      collected.push(...(await scrapeSearchDom(page)))
    }
    return dedupePosts(collected, limit)
  } finally {
    await close()
  }
}