98 lines
3.0 KiB
TypeScript
98 lines
3.0 KiB
TypeScript
import { humanDelay } from "@/lib/utils";
|
|
import { withPage } from "./browser";
|
|
import { browserSessionOptions, type ActiveSession } from "./session";
|
|
import type { MediaType } from "@/lib/types/viral";
|
|
|
|
/**
 * Walks an arbitrary (typically JSON-parsed) value depth-first and records
 * every media URL found along the way.
 *
 * On each non-array object, the `url` and `display_url` properties are
 * inspected (in that order); a property is recorded when it is a string that
 * `isMediaUrl` accepts. After that, every nested object or array value is
 * visited recursively. Primitives, `null` and `undefined` are ignored.
 *
 * @param data - Value to inspect; anything that is not an object is skipped.
 * @param urls - Set that receives the matching URLs. It is mutated in place,
 *   and URLs are inserted in traversal order.
 */
function extractUrlsFromJson(data: unknown, urls: Set<string>): void {
  if (data === null || typeof data !== "object") {
    return;
  }

  if (Array.isArray(data)) {
    data.forEach((entry) => extractUrlsFromJson(entry, urls));
    return;
  }

  const record = data as Record<string, unknown>;

  // Only these two keys are treated as carrying a media URL directly.
  for (const key of ["url", "display_url"]) {
    const candidate = record[key];
    if (typeof candidate === "string" && isMediaUrl(candidate)) {
      urls.add(candidate);
    }
  }

  // Descend into nested containers; scalar property values need no visit.
  for (const child of Object.values(record)) {
    if (child !== null && typeof child === "object") {
      extractUrlsFromJson(child, urls);
    }
  }
}
|
|
|
|
/**
 * Heuristically decides whether a URL string points at post media.
 *
 * All checks are plain substring tests on the whole string (the URL is not
 * parsed). A URL is accepted only when all three hold:
 *  1. it contains one of `cdninstagram.com`, `fbcdn.net` or `threads.net`;
 *  2. it does not contain `profile` anywhere
 *     (presumably to skip avatar images — confirm);
 *  3. it contains an image extension (`.jpg`, `.jpeg`, `.png`, `.webp`) or
 *     the `stp=` marker.
 *
 * @param url - Candidate URL.
 * @returns `true` when the URL passes all three checks.
 */
function isMediaUrl(url: string): boolean {
  const mentions = (fragment: string): boolean => url.includes(fragment);

  const onKnownHost = ["cdninstagram.com", "fbcdn.net", "threads.net"].some(mentions);
  if (!onKnownHost) {
    return false;
  }

  if (mentions("profile")) {
    return false;
  }

  return [".jpg", ".jpeg", ".png", ".webp", "stp="].some(mentions);
}
|
|
|
|
/**
 * Classifies a post purely by how many media URLs were collected for it.
 *
 * - no URLs        -> `"text-only"`
 * - exactly one    -> `"single-image"`
 * - two or more    -> `"carousel"`
 *
 * This function never returns `"mixed"`: a URL count alone cannot identify
 * that case. The earlier trailing `return "mixed"` was unreachable, since an
 * array length is always 0, 1 or greater than 1.
 *
 * @param urls - Collected media URLs; only the length is read.
 * @returns The media type implied by the number of URLs.
 */
function inferMediaType(urls: readonly string[]): MediaType {
  switch (urls.length) {
    case 0:
      return "text-only";
    case 1:
      return "single-image";
    default:
      return "carousel";
  }
}
|
|
|
|
/**
 * Opens a post permalink in a browser page and gathers the media URLs it
 * exposes, using three best-effort sources in this order:
 *
 *  1. JSON network responses whose URL contains `graphql`, `threads` or
 *     `instagram` (captured by a listener while the page loads);
 *  2. the `src` attribute of up to the first 10 matching `<img>` elements;
 *  3. inline `script[type="application/json"][data-sjs]` payloads.
 *
 * Failures inside any of those three sources are swallowed so one bad
 * response, element or script does not abort the scrape. A failure of the
 * navigation itself is not caught here.
 *
 * @param storageState - Passed through to `withPage`
 *   (presumably a serialized browser storage state — confirm against `withPage`).
 * @param permalink - URL of the post to open.
 * @param session - Optional active session; when given, the result of
 *   `browserSessionOptions(session)` is passed to `withPage` as its third argument.
 * @returns At most 8 unique URLs in discovery order, plus the media type
 *   inferred from that (already truncated) list.
 */
export async function scrapePostMedia(
  storageState: string,
  permalink: string,
  session?: ActiveSession
): Promise<{ urls: string[]; mediaType: MediaType }> {
  const pageOptions = session ? browserSessionOptions(session) : undefined;

  return withPage(
    storageState,
    async (page) => {
      // Insertion order matters: only the first 8 entries are returned.
      const found = new Set<string>();

      // Source 1: harvest URLs from JSON responses as they arrive.
      page.on("response", async (response) => {
        const responseUrl = response.url();
        const relevant =
          responseUrl.includes("graphql") ||
          responseUrl.includes("threads") ||
          responseUrl.includes("instagram");
        if (!relevant) {
          return;
        }

        try {
          const contentType = response.headers()["content-type"] ?? "";
          if (contentType.includes("json")) {
            extractUrlsFromJson(await response.json(), found);
          }
        } catch {
          // Unreadable or non-JSON body: skip this response.
        }
      });

      await page.goto(permalink, { waitUntil: "networkidle", timeout: 60000 });
      await humanDelay(1500, 2500);

      // Source 2: image elements already rendered in the DOM.
      const images = page.locator(
        'article img[src*="cdninstagram"], article img[src*="fbcdn"], [role="presentation"] img[src*="cdninstagram"]'
      );
      const imageLimit = Math.min(await images.count(), 10);
      for (let index = 0; index < imageLimit; index += 1) {
        try {
          const src = await images.nth(index).getAttribute("src", { timeout: 2000 });
          if (src && isMediaUrl(src)) {
            found.add(src);
          }
        } catch {
          // Element vanished or timed out: move on to the next one.
        }
      }

      // Source 3: JSON payloads embedded in the page's script tags.
      for (const script of await page.locator('script[type="application/json"][data-sjs]').all()) {
        try {
          const raw = await script.textContent();
          if (raw) {
            extractUrlsFromJson(JSON.parse(raw), found);
          }
        } catch {
          // Malformed or unreadable script content: skip it.
        }
      }

      const urls = Array.from(found).slice(0, 8);
      const mediaType = inferMediaType(urls);
      return { urls, mediaType };
    },
    pageOptions
  );
}