import { humanDelay } from "@/lib/utils";
import { withPage } from "./browser";
import { browserSessionOptions, type ActiveSession } from "./session";
import type { MediaType } from "@/lib/types/viral";

/** Maximum number of matching DOM `<img>` elements inspected per post. */
const MAX_DOM_IMAGES = 10;

/** Maximum number of media URLs returned per post. */
const MAX_MEDIA_URLS = 8;

/**
 * Decides whether a URL looks like a post image.
 *
 * A URL qualifies when it contains one of the known CDN/host substrings,
 * does not contain "profile", and contains either a common image extension
 * or an `stp=` marker.
 *
 * NOTE(review): the "profile" exclusion presumably filters out avatar images,
 * and `stp=` presumably marks CDN-transformed images — confirm against real
 * traffic.
 */
function isMediaUrl(url: string): boolean {
  const fromKnownHost =
    url.includes("cdninstagram.com") ||
    url.includes("fbcdn.net") ||
    url.includes("threads.net");
  if (!fromKnownHost || url.includes("profile")) return false;

  return (
    url.includes(".jpg") ||
    url.includes(".jpeg") ||
    url.includes(".png") ||
    url.includes(".webp") ||
    url.includes("stp=")
  );
}

/**
 * Recursively walks an arbitrary JSON value and adds every string found under
 * a `url` or `display_url` key that passes {@link isMediaUrl} to `urls`.
 *
 * @param data - Parsed JSON of unknown shape; non-objects are ignored.
 * @param urls - Accumulator that is mutated in place.
 */
function extractUrlsFromJson(data: unknown, urls: Set<string>): void {
  if (!data || typeof data !== "object") return;

  if (Array.isArray(data)) {
    for (const item of data) extractUrlsFromJson(item, urls);
    return;
  }

  const obj = data as Record<string, unknown>;
  if (typeof obj.url === "string" && isMediaUrl(obj.url)) {
    urls.add(obj.url);
  }
  if (typeof obj.display_url === "string" && isMediaUrl(obj.display_url)) {
    urls.add(obj.display_url);
  }

  // Descend into nested objects and arrays; primitives cannot hold more URLs.
  for (const value of Object.values(obj)) {
    if (value && typeof value === "object") extractUrlsFromJson(value, urls);
  }
}

/**
 * Classifies a post purely by how many media URLs were collected:
 * none → "text-only", one → "single-image", more → "carousel".
 *
 * The count alone cannot distinguish other media kinds, so "mixed" is never
 * produced here (the original trailing `return "mixed"` was unreachable).
 */
function inferMediaType(urls: string[]): MediaType {
  if (urls.length === 0) return "text-only";
  return urls.length === 1 ? "single-image" : "carousel";
}

/**
 * Opens a post permalink in a browser page and collects its media URLs from
 * three sources: JSON network responses, matching `<img>` elements in the DOM,
 * and embedded `script[type="application/json"][data-sjs]` payloads.
 *
 * @param storageState - Passed through to `withPage` as its first argument
 *   (presumably a serialized browser storage state — confirm in `./browser`).
 * @param permalink - URL of the post to load.
 * @param session - Optional active session; when given, the result of
 *   `browserSessionOptions(session)` is passed to `withPage`.
 * @returns Up to {@link MAX_MEDIA_URLS} de-duplicated URLs and the media type
 *   inferred from their count.
 */
export async function scrapePostMedia(
  storageState: string,
  permalink: string,
  session?: ActiveSession
): Promise<{ urls: string[]; mediaType: MediaType }> {
  return withPage(
    storageState,
    async (page) => {
      const collected = new Set<string>();

      // Registered before navigation so responses fired during load are seen.
      page.on("response", async (response) => {
        const url = response.url();
        if (
          !url.includes("graphql") &&
          !url.includes("threads") &&
          !url.includes("instagram")
        ) {
          return;
        }
        try {
          const contentType = response.headers()["content-type"] ?? "";
          if (!contentType.includes("json")) return;
          const json: unknown = await response.json();
          extractUrlsFromJson(json, collected);
        } catch {
          // Best-effort: a response whose body cannot be read or parsed is
          // skipped so it cannot abort the scrape.
        }
      });

      await page.goto(permalink, { waitUntil: "networkidle", timeout: 60000 });
      // NOTE(review): presumably a randomized pause in milliseconds — confirm
      // against humanDelay in "@/lib/utils".
      await humanDelay(1500, 2500);

      const imgs = page.locator(
        'article img[src*="cdninstagram"], article img[src*="fbcdn"], [role="presentation"] img[src*="cdninstagram"]'
      );
      const count = await imgs.count();
      for (let i = 0; i < Math.min(count, MAX_DOM_IMAGES); i++) {
        try {
          const src = await imgs.nth(i).getAttribute("src", { timeout: 2000 });
          if (src && isMediaUrl(src)) collected.add(src);
        } catch {
          // Element failed to yield its src within the timeout; skip it.
        }
      }

      const scripts = await page
        .locator('script[type="application/json"][data-sjs]')
        .all();
      for (const script of scripts) {
        try {
          const raw = await script.textContent();
          if (!raw) continue;
          extractUrlsFromJson(JSON.parse(raw) as unknown, collected);
        } catch {
          // Unreadable or malformed embedded JSON; skip this script tag.
        }
      }

      const urls = Array.from(collected).slice(0, MAX_MEDIA_URLS);
      return { urls, mediaType: inferMediaType(urls) };
    },
    session ? browserSessionOptions(session) : undefined
  );
}