thread-master/worker/threads-profile-scraper.ts

245 lines
7.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { chromium, type BrowserContext, type BrowserContextOptions, type Page } from 'playwright'
/** Public shape of a single scraped post, as returned by `getProfilePosts`. */
export type ScrapedPost = {
/** Body text of the post. */
text: string
/** URL of the post, when one could be determined. */
permalink?: string
/** Like count, when the source exposed one. */
likeCount?: number
/** Reply count, when the source exposed one. */
replyCount?: number
}
/** User-agent string applied to the browser context (identifies as desktop Chrome 131 on macOS). */
const USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
/** Command-line flags handed to `chromium.launch`. */
const BROWSER_ARGS = [
// Disables the Blink feature named `AutomationControlled`.
'--disable-blink-features=AutomationControlled',
'--no-first-run',
'--no-default-browser-check',
]
/**
 * Init script registered on the browser context: installs a getter-only
 * `webdriver` property on `navigator` that always reads as `false`.
 */
const STEALTH_INIT_SCRIPT = () => {
  const alwaysFalse: PropertyDescriptor = {
    get() {
      return false
    },
  }
  Object.defineProperty(navigator, 'webdriver', alwaysFalse)
}
/** Internal post record: a `ScrapedPost` plus the identity fields used while collecting and de-duplicating. */
type ThreadsPost = ScrapedPost & {
/** Post identifier taken from the payload or the permalink; the first choice for the de-duplication key. */
externalId?: string
/** Username of the post's author. */
authorName?: string
}
/**
 * Depth-first traversal of an arbitrary JSON-like value.
 *
 * `visit` is invoked once for every non-array object reached, parent before
 * children. Arrays are descended into but never handed to `visit`; primitives,
 * `null` and `undefined` are ignored.
 *
 * @param data  Value to traverse.
 * @param visit Callback receiving each object encountered.
 */
function walkJson(data: unknown, visit: (obj: Record<string, unknown>) => void): void {
  if (data === null || typeof data !== 'object') return

  if (Array.isArray(data)) {
    for (let index = 0; index < data.length; index++) {
      walkJson(data[index], visit)
    }
    return
  }

  const record = data as Record<string, unknown>
  visit(record)
  // The recursive call discards primitives and null on its own.
  for (const child of Object.values(record)) {
    walkJson(child, visit)
  }
}
/**
 * Picks the text of a post-like JSON object.
 *
 * Candidates are tried in order: `caption.text`, `text_post_app_info.text`,
 * then the object's own `text`. The first candidate that is actually a string
 * wins (an empty string counts). Because the payload is untyped JSON, a
 * candidate of any other type is treated as absent instead of being returned
 * under a `string` type.
 *
 * @param obj Object taken from a parsed JSON payload.
 * @returns The post text, or `undefined` when no candidate is a string.
 */
function getPostText(obj: Record<string, unknown>): string | undefined {
  // Reads `.text` from a nested holder, tolerating null and primitives.
  const nestedText = (holder: unknown): unknown =>
    holder !== null && typeof holder === 'object' ? (holder as { text?: unknown }).text : undefined

  const candidates = [nestedText(obj.caption), nestedText(obj.text_post_app_info), obj.text]
  for (const candidate of candidates) {
    if (typeof candidate === 'string') return candidate
  }
  return undefined
}
/**
 * Appends every post-like object found anywhere inside `data` to `posts`.
 *
 * An object qualifies when `getPostText` yields a string of at least five
 * characters. The payload is untyped JSON, so each field is checked at runtime
 * rather than cast: identifiers must be a non-empty string or non-zero number,
 * usernames must be strings, and counts must be finite numbers. Anything else
 * is recorded as `undefined`.
 *
 * NOTE(review): nested objects that carry their own `text` (possibly a post's
 * `caption` object) also qualify and are pushed as separate entries — confirm
 * against real payloads whether that is intended.
 *
 * @param data  Parsed JSON of unknown shape.
 * @param posts Accumulator that receives the extracted posts (mutated).
 */
function extractPostsFromJson(data: unknown, posts: ThreadsPost[]): void {
  // Reads `.username` from a nested holder, accepting only string values.
  const usernameOf = (holder: unknown): string | undefined => {
    if (holder === null || typeof holder !== 'object') return undefined
    const name = (holder as { username?: unknown }).username
    return typeof name === 'string' ? name : undefined
  }
  // Accepts only real, finite numbers as counters.
  const countOf = (value: unknown): number | undefined =>
    typeof value === 'number' && Number.isFinite(value) ? value : undefined

  walkJson(data, (obj) => {
    const text = getPostText(obj)
    if (typeof text !== 'string' || text.length < 5) return

    // First non-nullish of code / pk / id; falsy values (0, '') count as "no id".
    const rawId = obj.code ?? obj.pk ?? obj.id
    const code =
      (typeof rawId === 'string' || typeof rawId === 'number') && rawId ? String(rawId) : undefined
    const username = usernameOf(obj.user) ?? usernameOf(obj.owner)

    const appInfo = obj.text_post_app_info
    const directReplies =
      appInfo !== null && typeof appInfo === 'object'
        ? (appInfo as { direct_reply_count?: unknown }).direct_reply_count
        : undefined

    posts.push({
      externalId: code,
      text,
      permalink:
        code && username ? `https://www.threads.com/@${username}/post/${code}` : undefined,
      authorName: username,
      likeCount: countOf(obj.like_count),
      replyCount: countOf(directReplies) ?? countOf(obj.reply_count),
    })
  })
}
/**
 * Harvests posts from the JSON blobs embedded in the page's
 * `<script type="application/json" data-sjs>` tags.
 *
 * Only blobs whose text mentions `thread_items` are parsed; a script that
 * cannot be read or parsed is skipped.
 *
 * @param page Page whose scripts are inspected.
 * @returns Posts found in the embedded JSON (possibly empty).
 */
async function extractFromPageScripts(page: Page): Promise<ThreadsPost[]> {
  const found: ThreadsPost[] = []
  const scriptTags = await page.locator('script[type="application/json"][data-sjs]').all()

  for (const tag of scriptTags) {
    try {
      const body = (await tag.textContent()) ?? ''
      if (body.includes('thread_items')) {
        extractPostsFromJson(JSON.parse(body), found)
      }
    } catch {
      // Unreadable or malformed script: move on to the next one.
    }
  }

  return found
}
/**
 * Subscribes to the page's network responses and feeds every JSON response
 * into `extractPostsFromJson`, accumulating posts in `collected`.
 *
 * A response is considered only when its URL contains `graphql`, `threads` or
 * `instagram` and its `content-type` header mentions `json`. Failures while
 * reading or parsing a response are ignored.
 *
 * @param page      Page to listen on.
 * @param collected Accumulator that receives extracted posts (mutated).
 */
function attachCollector(page: Page, collected: ThreadsPost[]) {
  const urlMarkers = ['graphql', 'threads', 'instagram']

  page.on('response', async (response) => {
    const url = response.url()
    const relevant = urlMarkers.some((marker) => url.includes(marker))
    if (!relevant) return

    try {
      const contentType = response.headers()['content-type'] ?? ''
      if (contentType.includes('json')) {
        extractPostsFromJson(await response.json(), collected)
      }
    } catch {
      // Best effort: unreadable bodies are dropped.
    }
  })
}
/**
 * Reads posts straight from the rendered DOM.
 *
 * Walks the first `a[href*="/post/"]` links (capped), and for each distinct
 * href looks for the first `dir="auto"` element inside the link's pressable
 * container, falling back to a nearby ancestor `div`. The text is trimmed
 * BEFORE the minimum-length check, so whitespace-only content is skipped
 * rather than stored as a post with empty text.
 *
 * @param page     Page showing the profile.
 * @param username Fallback author name when the href does not contain one.
 * @returns Posts recovered from the DOM (possibly empty).
 */
async function scrapeProfileDom(page: Page, username: string): Promise<ThreadsPost[]> {
  const MAX_LINKS = 35
  const MIN_TEXT_LENGTH = 3
  const posts: ThreadsPost[] = []
  const seen = new Set<string>()
  const links = page.locator('a[href*="/post/"]')
  const total = Math.min(await links.count(), MAX_LINKS)

  for (let i = 0; i < total; i++) {
    const link = links.nth(i)
    try {
      const href = await link.getAttribute('href', { timeout: 1200 })
      if (!href || seen.has(href)) continue
      seen.add(href)

      const container = link.locator("xpath=ancestor::*[contains(@data-pressable-container,'true')][1]")
      const scope =
        (await container.count()) > 0 ? container : link.locator('xpath=ancestor::div[position()<=6]').first()
      // A missing or slow text node yields '' and the link is skipped below.
      const rawText = await scope
        .locator('div[dir="auto"], span[dir="auto"]')
        .first()
        .innerText({ timeout: 1200 })
        .catch(() => '')

      const text = rawText.trim()
      if (text.length < MIN_TEXT_LENGTH) continue

      posts.push({
        text,
        permalink: href.startsWith('http') ? href : `https://www.threads.com${href}`,
        authorName: href.match(/@([^/]+)\/post/)?.[1] ?? username,
        externalId: href.match(/\/post\/([^/?]+)/)?.[1],
      })
    } catch {
      // A link that cannot be read is skipped; the remaining links are still processed.
    }
  }
  return posts
}
/**
 * Waits a randomized 1200–2999 ms on the page after landing.
 *
 * @param page Page to pause on.
 */
async function humanLandingPause(page: Page) {
  const jitterMs = Math.floor(Math.random() * 1800)
  await page.waitForTimeout(1200 + jitterMs)
}
/**
 * Scrolls the page down in 2–4 randomized wheel steps (500–1199 px each),
 * waiting a randomized 800–1999 ms after every step.
 *
 * @param page Page to scroll.
 */
async function humanScrollPage(page: Page) {
  let remaining = 2 + Math.floor(Math.random() * 3)
  while (remaining-- > 0) {
    const deltaY = 500 + Math.floor(Math.random() * 700)
    await page.mouse.wheel(0, deltaY)
    const pauseMs = 800 + Math.floor(Math.random() * 1200)
    await page.waitForTimeout(pauseMs)
  }
}
/**
 * Removes duplicate posts and projects the survivors onto the public
 * `ScrapedPost` shape, keeping at most `limit` entries in input order.
 *
 * Two posts are duplicates when they share the same key: `externalId`, else
 * `permalink`, else the first 120 characters of the text. The first occurrence
 * wins.
 *
 * @param posts Collected posts, possibly containing repeats.
 * @param limit Maximum number of posts to return; zero or negative yields `[]`.
 * @returns De-duplicated posts without the internal identity fields.
 */
function dedupePosts(posts: ThreadsPost[], limit: number): ScrapedPost[] {
  const out: ScrapedPost[] = []
  // Checking the limit only after a push would let one post through for limit <= 0.
  if (limit <= 0) return out

  const seen = new Set<string>()
  for (const post of posts) {
    const key = post.externalId ?? post.permalink ?? post.text.slice(0, 120)
    if (seen.has(key)) continue
    seen.add(key)
    out.push({
      text: post.text,
      permalink: post.permalink,
      likeCount: post.likeCount,
      replyCount: post.replyCount,
    })
    if (out.length >= limit) break
  }
  return out
}
/**
 * Launches Chromium and opens a browser context restored from a saved session.
 *
 * The session JSON is validated before anything is launched, so a corrupt
 * value never costs a browser start. If creating or preparing the context
 * fails, the browser is closed before the error propagates, because the
 * caller would otherwise never receive a `close` handle for it.
 *
 * @param storageState JSON text passed to `browser.newContext` as `storageState`.
 * @returns The context plus a `close` function that shuts down the context and
 *          then the browser (the browser is closed even if closing the context fails).
 * @throws Error when `storageState` is not valid JSON, or when launching the
 *         browser / creating the context fails.
 */
async function createBrowserContext(storageState: string): Promise<{ context: BrowserContext; close: () => Promise<void> }> {
  let parsedState: unknown
  try {
    parsedState = JSON.parse(storageState)
  } catch {
    throw new Error('瀏覽器 session 資料損毀,請到連線設定重新同步 Chrome extension')
  }

  const browser = await chromium.launch({
    // Headless unless PLAYWRIGHT_HEADLESS is exactly 'false'.
    headless: process.env.PLAYWRIGHT_HEADLESS !== 'false',
    args: BROWSER_ARGS,
  })

  try {
    const context = await browser.newContext({
      storageState: parsedState as BrowserContextOptions['storageState'],
      userAgent: USER_AGENT,
      viewport: { width: 1280, height: 900 },
      locale: 'zh-TW',
      timezoneId: 'Asia/Taipei',
    })
    await context.addInitScript(STEALTH_INIT_SCRIPT)

    return {
      context,
      close: async () => {
        try {
          await context.close()
        } finally {
          await browser.close()
        }
      },
    }
  } catch (error) {
    // Do not let a failing cleanup mask the original setup error.
    await browser.close().catch(() => undefined)
    throw error
  }
}
/**
 * Scrapes recent posts from a Threads profile using a saved browser session.
 *
 * Keeps the three-stage strategy of the legacy lib/threads-browser/profile.ts:
 * GraphQL responses → embedded JSON → DOM. Each later stage runs only while
 * fewer than two posts have been collected.
 *
 * @param storageState JSON text of the saved browser session; must not be blank.
 * @param username     Profile handle, with or without a leading `@`; surrounding
 *                     whitespace is ignored.
 * @param limit        Maximum number of posts to return (default 15).
 * @returns De-duplicated posts; `[]` when the handle is empty or the page
 *          reports that the profile was not found.
 * @throws Error when `storageState` is blank or not valid JSON, or when the
 *         browser / navigation fails.
 */
export async function getProfilePosts(
  storageState: string,
  username: string,
  limit = 15,
): Promise<ScrapedPost[]> {
  // Trim BEFORE stripping '@' so that ' @name' normalizes to 'name', not '@name'.
  const clean = username.trim().replace(/^@/, '').trim()
  if (!storageState.trim()) {
    throw new Error(
      '找不到 Chrome session。請先在巡樓選定經營帳號到連線頁用擴充套件同步 Threads 登入態後再跑 8D。',
    )
  }
  // An empty handle cannot name a profile; skip the browser launch entirely.
  if (!clean) return []

  const { context, close } = await createBrowserContext(storageState)
  try {
    const page = await context.newPage()
    const collected: ThreadsPost[] = []
    // Must be attached before navigation so responses from the initial load are captured.
    attachCollector(page, collected)

    const profileUrl = `https://www.threads.com/@${encodeURIComponent(clean)}`
    await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 45_000 })
    await humanLandingPause(page)

    // These phrases are treated as the page's "not found" marker.
    const bodyText = await page.locator('body').innerText()
    if (bodyText.includes('走丟') || bodyText.includes('找不到')) {
      return []
    }

    // A missing post link is not fatal; the later stages may still find data.
    await page.waitForSelector('a[href*="/post/"]', { timeout: 10_000 }).catch(() => undefined)
    await humanScrollPage(page)

    if (collected.length < 2) {
      collected.push(...(await extractFromPageScripts(page)))
    }
    if (collected.length < 2) {
      collected.push(...(await scrapeProfileDom(page, clean)))
    }

    return dedupePosts(
      collected.map((post) => ({ ...post, authorName: post.authorName ?? clean })),
      limit,
    )
  } finally {
    await close()
  }
}