haixunMaster/lib/threads-browser/media.ts

98 lines
3.0 KiB
TypeScript
Raw Permalink Normal View History

2026-06-21 12:50:31 +00:00
import { humanDelay } from "@/lib/utils";
import { withPage } from "./browser";
import { browserSessionOptions, type ActiveSession } from "./session";
import type { MediaType } from "@/lib/types/viral";
/**
 * Walks an arbitrary parsed-JSON value depth-first and records every media URL
 * found under a `url` or `display_url` string property.
 *
 * Traversal is pre-order: an object's own `url` / `display_url` are examined
 * before its nested values, and array items are visited in index order, so the
 * insertion order into `urls` follows document order.
 *
 * @param data - Any parsed JSON value; primitives and `null` are ignored.
 * @param urls - Set that accepted URLs are added to (mutated in place).
 */
function extractUrlsFromJson(data: unknown, urls: Set<string>): void {
  // Only objects and arrays can contain anything of interest.
  if (data === null || typeof data !== "object") return;

  if (Array.isArray(data)) {
    data.forEach((entry) => extractUrlsFromJson(entry, urls));
    return;
  }

  const record = data as Record<string, unknown>;

  // Check the two property names that are treated as media links.
  for (const key of ["url", "display_url"]) {
    const candidate = record[key];
    if (typeof candidate === "string" && isMediaUrl(candidate)) {
      urls.add(candidate);
    }
  }

  // Descend into every nested object or array, whatever its key.
  for (const key of Object.keys(record)) {
    const child = record[key];
    if (child !== null && typeof child === "object") {
      extractUrlsFromJson(child, urls);
    }
  }
}
/**
 * Heuristically decides whether a URL points at post media.
 *
 * All checks are plain substring matches against the whole URL. A URL is
 * accepted when it:
 *  - mentions one of the known hosts (`cdninstagram.com`, `fbcdn.net`,
 *    `threads.net`),
 *  - does not contain `profile` anywhere, and
 *  - contains an image extension (`.jpg`, `.jpeg`, `.png`, `.webp`) or the
 *    `stp=` marker.
 *
 * @param url - Candidate URL.
 * @returns `true` when every condition above holds.
 */
function isMediaUrl(url: string): boolean {
  const hostMarkers = ["cdninstagram.com", "fbcdn.net", "threads.net"];
  const imageMarkers = [".jpg", ".jpeg", ".png", ".webp", "stp="];

  const onKnownHost = hostMarkers.some((marker) => url.includes(marker));
  if (!onKnownHost) return false;

  if (url.includes("profile")) return false;

  return imageMarkers.some((marker) => url.includes(marker));
}
/**
 * Classifies a post from the number of media URLs collected for it.
 *
 * The decision is based on the count alone:
 *  - no URLs  -> `"text-only"`
 *  - one URL  -> `"single-image"`
 *  - two or more -> `"carousel"`
 *
 * NOTE(review): a count cannot tell image from video, so this never yields
 * any other `MediaType` member. The previous trailing `return "mixed"` was
 * unreachable (an array length is always 0, 1, or greater than 1) and has
 * been removed.
 *
 * @param urls - Media URLs collected for a single post.
 * @returns The inferred media type.
 */
function inferMediaType(urls: string[]): MediaType {
  if (urls.length === 0) return "text-only";
  return urls.length === 1 ? "single-image" : "carousel";
}
/**
 * Loads a post permalink in a browser page and gathers its media URLs.
 *
 * URLs are collected from three sources, in this order of insertion:
 *  1. JSON bodies of network responses whose URL mentions `graphql`,
 *     `threads` or `instagram` (captured while the page loads),
 *  2. the `src` of up to the first 10 matching `<img>` elements in the DOM,
 *  3. JSON embedded in `script[type="application/json"][data-sjs]` tags.
 *
 * Failures while reading any individual response, image or script are
 * deliberately ignored so that one bad source does not abort the scrape.
 *
 * @param storageState - Passed straight through to `withPage`.
 * @param permalink - URL of the post to open.
 * @param session - Optional active session; when given, it is converted with
 *   `browserSessionOptions` and forwarded to `withPage`.
 * @returns At most 8 distinct URLs and the media type inferred from them.
 */
export async function scrapePostMedia(
  storageState: string,
  permalink: string,
  session?: ActiveSession
): Promise<{ urls: string[]; mediaType: MediaType }> {
  const pageOptions = session ? browserSessionOptions(session) : undefined;

  return withPage(
    storageState,
    async (page) => {
      const found = new Set<string>();

      // The listener must be attached before navigating so that responses
      // produced by the initial load are seen.
      page.on("response", async (response) => {
        const responseUrl = response.url();
        const looksRelevant =
          responseUrl.includes("graphql") ||
          responseUrl.includes("threads") ||
          responseUrl.includes("instagram");
        if (!looksRelevant) return;

        try {
          const contentType = response.headers()["content-type"] ?? "";
          if (!contentType.includes("json")) return;
          const payload = await response.json();
          extractUrlsFromJson(payload, found);
        } catch {
          // Best effort: unreadable or non-JSON bodies are skipped.
        }
      });

      await page.goto(permalink, { waitUntil: "networkidle", timeout: 60000 });
      await humanDelay(1500, 2500);

      // Source 2: images already rendered in the DOM (capped at 10).
      const imageLocator = page.locator(
        'article img[src*="cdninstagram"], article img[src*="fbcdn"], [role="presentation"] img[src*="cdninstagram"]'
      );
      const imageCount = await imageLocator.count();
      const imageLimit = Math.min(imageCount, 10);
      for (let index = 0; index < imageLimit; index += 1) {
        try {
          const src = await imageLocator.nth(index).getAttribute("src", { timeout: 2000 });
          if (src && isMediaUrl(src)) {
            found.add(src);
          }
        } catch {
          // Best effort: an element that cannot be read is skipped.
        }
      }

      // Source 3: JSON blobs embedded in the page markup.
      const scriptLocators = await page.locator('script[type="application/json"][data-sjs]').all();
      for (const scriptLocator of scriptLocators) {
        try {
          const rawJson = await scriptLocator.textContent();
          if (!rawJson) continue;
          extractUrlsFromJson(JSON.parse(rawJson), found);
        } catch {
          // Best effort: malformed JSON or unreadable script is skipped.
        }
      }

      // Keep only the first 8 URLs in insertion order.
      const urls = [...found].slice(0, 8);
      return { urls, mediaType: inferMediaType(urls) };
    },
    pageOptions
  );
}