haixunMaster/lib/threads-browser/extract.ts

93 lines
3.1 KiB
TypeScript

import type { ThreadsPost, ThreadsReply } from "./types";
/**
 * Depth-first, pre-order traversal of an arbitrary JSON-like value.
 *
 * `visit` is invoked once for every non-array object reachable from `data`,
 * parent before children. Arrays are traversed but never passed to `visit`;
 * primitives and `null` are ignored.
 *
 * @param data  Any value (typically the result of `JSON.parse`).
 * @param visit Callback receiving each plain object encountered.
 */
function walkJson(data: unknown, visit: (obj: Record<string, unknown>) => void): void {
  // Primitives, null and undefined carry nothing to visit or descend into.
  if (data === null || typeof data !== "object") return;

  if (!Array.isArray(data)) {
    const node = data as Record<string, unknown>;
    visit(node);
    // The entry guard above filters out non-object children, so every value
    // can simply be handed to the recursive call.
    Object.values(node).forEach((child) => {
      walkJson(child, visit);
    });
    return;
  }

  for (const element of data) {
    walkJson(element, visit);
  }
}
/**
 * Returns the text carried by a post-like object, if any.
 *
 * Sources are tried in priority order:
 *   1. `obj.caption.text`
 *   2. `obj.text_post_app_info.text`
 *   3. `obj.text`
 *
 * A source is only accepted when its value is actually a string at runtime.
 * The input is untyped JSON, so a numeric or object-valued `text` must not be
 * returned as if it were a string, nor may it hide a valid lower-priority
 * source. An empty string is still a string and is returned as-is.
 *
 * @param obj Candidate object taken from parsed JSON.
 * @returns The first string found, or `undefined` when no source holds one.
 */
function getPostText(obj: Record<string, unknown>): string | undefined {
  // Reads `container.text`, tolerating containers that are missing or not objects.
  const textOf = (container: unknown): string | undefined => {
    if (typeof container !== "object" || container === null) return undefined;
    const value = (container as { text?: unknown }).text;
    return typeof value === "string" ? value : undefined;
  };

  return (
    textOf(obj.caption) ??
    textOf(obj.text_post_app_info) ??
    (typeof obj.text === "string" ? obj.text : undefined)
  );
}
/**
 * Scans `data` for post-like objects and appends one `ThreadsPost` per match
 * to `posts` (the array is mutated in place; nothing is returned).
 *
 * An object counts as a post when `getPostText` yields a string of at least
 * five characters. All other fields are optional and are only copied when the
 * raw JSON value has a usable runtime type, because the input is unvalidated.
 *
 * @param data  Parsed JSON of unknown shape.
 * @param posts Accumulator that receives the extracted posts.
 */
export function extractPostsFromJson(data: unknown, posts: ThreadsPost[]): void {
  // Sub-objects (`caption`, `text_post_app_info`) whose text belongs to their
  // parent. The walker descends into them after visiting the parent, and
  // without this set each post would be emitted a second time as a bogus,
  // author-less entry built from the container alone.
  const textContainers = new WeakSet<object>();

  // Accepts finite numbers and numeric strings; everything else is dropped.
  const toNumber = (value: unknown): number | undefined => {
    if (typeof value === "number") return Number.isFinite(value) ? value : undefined;
    if (typeof value === "string" && value.trim() !== "") {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : undefined;
    }
    return undefined;
  };

  // Reads `holder.username` when the holder is an object and the name is a string.
  const toUsername = (holder: unknown): string | undefined => {
    if (typeof holder !== "object" || holder === null) return undefined;
    const name = (holder as { username?: unknown }).username;
    return typeof name === "string" ? name : undefined;
  };

  // Converts epoch seconds to a Date, never returning an Invalid Date.
  const toDate = (epochSeconds: number | undefined): Date | undefined => {
    if (!epochSeconds) return undefined;
    const date = new Date(epochSeconds * 1000);
    return Number.isNaN(date.getTime()) ? undefined : date;
  };

  walkJson(data, (obj) => {
    if (textContainers.has(obj)) return;

    // Relies on the walker visiting a parent before its children.
    for (const key of ["caption", "text_post_app_info"]) {
      const nested = obj[key];
      if (typeof nested === "object" && nested !== null) textContainers.add(nested);
    }

    const text = getPostText(obj);
    if (typeof text !== "string" || text.length < 5) return;

    // First non-nullish identifier wins; only strings and numbers are usable.
    const rawCode = obj.code ?? obj.pk ?? obj.id;
    const isIdLike = typeof rawCode === "string" || typeof rawCode === "number";
    const code = isIdLike && rawCode ? String(rawCode) : undefined;

    const username = toUsername(obj.user) ?? toUsername(obj.owner);

    const info = obj.text_post_app_info;
    const directReplies =
      typeof info === "object" && info !== null
        ? (info as { direct_reply_count?: unknown }).direct_reply_count
        : undefined;

    posts.push({
      externalId: code,
      text,
      // NOTE(review): when `code` is absent this falls back to `pk`/`id`;
      // confirm such URLs actually resolve on threads.com.
      permalink:
        code && username ? `https://www.threads.com/@${username}/post/${code}` : undefined,
      authorName: username,
      likeCount: toNumber(obj.like_count),
      replyCount: toNumber(directReplies) ?? toNumber(obj.reply_count),
      postedAt: toDate(toNumber(obj.taken_at)),
    });
  });
}
/**
 * Scans `data` for reply-like objects and appends one `ThreadsReply` per match
 * to `replies` (the array is mutated in place; nothing is returned).
 *
 * An object is accepted when `getPostText` yields a string of at least two
 * characters and it is not recognisably a root post: objects that expose
 * `reply_count` but carry no `reply_to_author` marker (neither at the top
 * level nor inside `text_post_app_info`) are skipped.
 *
 * @param data    Parsed JSON of unknown shape.
 * @param replies Accumulator that receives the extracted replies.
 */
export function extractRepliesFromJson(data: unknown, replies: ThreadsReply[]): void {
  // Sub-objects (`caption`, `text_post_app_info`) whose text belongs to their
  // parent. They are registered before any filtering so that the caption of a
  // rejected root post cannot slip through as a "reply" when the walker later
  // reaches the container on its own.
  const textContainers = new WeakSet<object>();

  // Accepts finite numbers and numeric strings; everything else is dropped.
  const toNumber = (value: unknown): number | undefined => {
    if (typeof value === "number") return Number.isFinite(value) ? value : undefined;
    if (typeof value === "string" && value.trim() !== "") {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : undefined;
    }
    return undefined;
  };

  // Reads `holder.username` when the holder is an object and the name is a string.
  const toUsername = (holder: unknown): string | undefined => {
    if (typeof holder !== "object" || holder === null) return undefined;
    const name = (holder as { username?: unknown }).username;
    return typeof name === "string" ? name : undefined;
  };

  // Converts epoch seconds to a Date, never returning an Invalid Date.
  const toDate = (epochSeconds: number | undefined): Date | undefined => {
    if (!epochSeconds) return undefined;
    const date = new Date(epochSeconds * 1000);
    return Number.isNaN(date.getTime()) ? undefined : date;
  };

  walkJson(data, (obj) => {
    if (textContainers.has(obj)) return;

    // Relies on the walker visiting a parent before its children.
    for (const key of ["caption", "text_post_app_info"]) {
      const nested = obj[key];
      if (typeof nested === "object" && nested !== null) textContainers.add(nested);
    }

    const text = getPostText(obj);
    if (typeof text !== "string" || text.length < 2) return;

    const info = obj.text_post_app_info;
    const nestedReplyTarget =
      typeof info === "object" && info !== null
        ? (info as { reply_to_author?: unknown }).reply_to_author
        : undefined;
    const isReply = obj.reply_to_author !== undefined || nestedReplyTarget !== undefined;

    // Something that tracks its own reply count but answers nobody is a root post.
    if (!isReply && obj.reply_count !== undefined) return;

    replies.push({
      text,
      authorName: toUsername(obj.user) ?? toUsername(obj.owner),
      likeCount: toNumber(obj.like_count) ?? 0,
      postedAt: toDate(toNumber(obj.taken_at)),
    });
  });
}
/**
 * Harvests posts and replies from the JSON payloads embedded in the page's
 * `<script type="application/json" data-sjs>` tags.
 *
 * Only scripts whose text mentions `thread_items` are parsed. Extraction is
 * best-effort: any failure while reading, parsing or processing one script is
 * ignored and the loop moves on to the next script.
 *
 * @param page Playwright page to inspect.
 * @returns Everything collected across all matching scripts.
 */
export async function extractFromPageScripts(page: import("playwright").Page): Promise<{
  posts: ThreadsPost[];
  replies: ThreadsReply[];
}> {
  const collected: { posts: ThreadsPost[]; replies: ThreadsReply[] } = {
    posts: [],
    replies: [],
  };

  const scriptTags = await page.locator('script[type="application/json"][data-sjs]').all();

  for (let index = 0; index < scriptTags.length; index += 1) {
    try {
      const payload = await scriptTags[index].textContent();
      // Null, empty, or unrelated payloads are not worth parsing.
      if (!payload?.includes("thread_items")) continue;

      const parsed = JSON.parse(payload);
      extractPostsFromJson(parsed, collected.posts);
      extractRepliesFromJson(parsed, collected.replies);
    } catch {
      // skip malformed script
    }
  }

  return collected;
}