haixunMaster/lib/search/dedupe.ts

61 lines
1.7 KiB
TypeScript

import crypto from "crypto";
import { cacheGet, cacheSet } from "./cache";
import type { SearchResult } from "./types";
// First argument handed to cacheGet/cacheSet by the notify helpers in this
// module; presumably the cache namespace that keeps these entries separate
// from other cache users (confirm against ./cache).
const DEDUPE_NS = "dedupe";
/**
 * Reduces a URL to a canonical string so that cosmetic variants compare equal.
 *
 * For parseable URLs this drops a leading `www.` from the host, clears the
 * query string and fragment, and strips trailing slashes from the path (an
 * empty path is kept as `/`).
 *
 * @param url - The URL text to canonicalize.
 * @returns The canonical URL string, or the trimmed input when `URL` cannot parse it.
 */
export function normalizeThreadsUrl(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not an absolute URL: fall back to the raw text without outer whitespace.
    return url.trim();
  }

  parsed.hostname = parsed.hostname.replace(/^www\./, "");
  parsed.search = "";
  parsed.hash = "";

  const withoutTrailingSlashes = parsed.pathname.replace(/\/+$/, "");
  // Stripping can leave nothing at all (path was only slashes); keep the root.
  parsed.pathname = withoutTrailingSlashes === "" ? "/" : withoutTrailingSlashes;

  return parsed.toString();
}
/**
 * Pulls the post id out of a Threads post URL of the form
 * `threads.com/@<user>/post/<id>` (or the legacy `threads.net` domain).
 *
 * The domain is matched case-insensitively; the id is returned exactly as it
 * appears in the URL and ends at the next `/`, `?` or `#`.
 *
 * @param url - Any string that may contain a Threads post URL.
 * @returns The post id, or `undefined` when the string has no such URL.
 */
export function extractThreadId(url: string): string | undefined {
  // Accept both domains so the same post linked via threads.net and
  // threads.com resolves to one id instead of two different URL hashes.
  const match = /threads\.(?:com|net)\/@[^/]+\/post\/([^/?#]+)/i.exec(url);
  return match?.[1];
}
/**
 * Builds the key used to decide whether two search results are the same item.
 *
 * Preference order:
 *  1. `thread:<id>` from `result.threadId`, or from the id embedded in
 *     `result.url` when `threadId` is missing or empty.
 *  2. `url_hash:<hex>`: the first 16 hex characters of the SHA-256 of the
 *     normalized URL.
 *
 * @param result - The search result to key.
 * @returns A string key prefixed with `thread:` or `url_hash:`.
 */
export function dedupeKeyForResult(result: SearchResult): string {
  // Deliberately `||`, not `??`: an empty-string threadId carries no identity,
  // so it must fall through to URL extraction rather than skip it.
  const threadId = result.threadId || extractThreadId(result.url);
  if (threadId) return `thread:${threadId}`;

  const normalized = normalizeThreadsUrl(result.url);
  const digest = crypto.createHash("sha256").update(normalized).digest("hex");
  return `url_hash:${digest.slice(0, 16)}`;
}
/**
 * Removes repeated results, keeping the first occurrence of each dedupe key
 * and preserving the input order.
 *
 * @param results - Results to filter; the array itself is not modified.
 * @returns A new array containing one result per distinct `dedupeKeyForResult` key.
 */
export function dedupeResults(results: SearchResult[]): SearchResult[] {
  // Map iterates in insertion order, so the first result seen for a key is
  // both the one retained and the one that fixes its output position.
  const firstByKey = new Map<string, SearchResult>();
  for (const result of results) {
    const key = dedupeKeyForResult(result);
    if (!firstByKey.has(key)) {
      firstByKey.set(key, result);
    }
  }
  return Array.from(firstByKey.values());
}
/**
 * Reports whether the cache holds the value `true` under `notify:<key>` in the
 * `DEDUPE_NS` namespace.
 *
 * @param key - Dedupe key to look up.
 * @returns `true` only when the cached value is exactly `true`; any other
 *   value, including a missing entry, yields `false`.
 */
export function isNotifyDuplicate(key: string): boolean {
  const cacheKey = `notify:${key}`;
  const cached = cacheGet<boolean>(DEDUPE_NS, cacheKey);
  return cached === true;
}
/**
 * Stores `true` under `notify:<key>` in the `DEDUPE_NS` namespace so that a
 * later `isNotifyDuplicate(key)` lookup can find it.
 *
 * @param key - Dedupe key to record.
 * @param ttlMs - Forwarded unchanged to `cacheSet`; presumably the entry's
 *   time-to-live in milliseconds (confirm against ./cache).
 */
export function markNotified(key: string, ttlMs: number): void {
  const cacheKey = `notify:${key}`;
  cacheSet(DEDUPE_NS, cacheKey, true, ttlMs);
}
/**
 * Keeps only the results whose dedupe key is not currently flagged by
 * `isNotifyDuplicate`.
 *
 * This only reads the cache; it does not mark the surviving results.
 *
 * @param results - Candidate results; the array itself is not modified.
 * @returns A new array with the already-flagged results removed.
 */
export function filterNotifyDuplicates(results: SearchResult[]): SearchResult[] {
  return results.filter(
    (result) => !isNotifyDuplicate(dedupeKeyForResult(result)),
  );
}