haixunMaster/lib/services/discover-accounts.ts

438 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import {
filterDiscoverItemsWithAi,
type DiscoverFilterItem,
} from "@/lib/ai/filter-discover-relevance";
import type { ProviderApiKeys } from "@/lib/ai/keys";
import { searchWebThorough } from "@/lib/services/web-search";
import { search } from "@/lib/threads-browser/search";
import {
BROAD_TAG_BLOCKLIST,
buildTopicAnchor,
scoreTopicRelevance,
type TopicAnchor,
} from "@/lib/topic-anchor";
import {
normalizeThreadsPostUrl,
normalizeUsername,
threadsProfileUrl,
type AccountConfidence,
type SimilarAccount,
} from "@/lib/types/research";
/**
 * Matches Threads profile links and captures the handle that follows `/@`.
 *
 * Accepts an optional `http:`/`https:` scheme, an optional `www.` prefix and
 * either the `threads.com` or the `threads.net` host. Capture group 1 is the
 * handle, made of ASCII letters, digits, `.` and `_`.
 *
 * The pattern carries the `g` and `i` flags. `extractUsernamesFromText`
 * builds a fresh RegExp from this one's source and flags on every call
 * instead of running `exec` on this instance.
 */
const THREADS_PROFILE_RE =
/(?:https?:)?\/\/(?:www\.)?threads\.(?:com|net)\/@([a-zA-Z0-9._]+)/gi;
/**
 * Lower-case names that `isValidUsername` refuses to treat as account handles.
 * The lookup is done on the lower-cased handle, so entries must stay lower-case.
 * NOTE(review): these look like site route segments and brand names rather
 * than real accounts — confirm before adding or removing entries.
 */
const RESERVED_USERNAMES = new Set([
"login",
"signup",
"search",
"explore",
"home",
"help",
"about",
"privacy",
"terms",
"settings",
"accounts",
"direct",
"reels",
"stories",
"legal",
"web",
"www",
"intent",
"share",
"threads",
"thread",
"instagram",
"meta",
]);
/**
 * A `TopicAnchor` extended with the discovery-specific fields that
 * `buildDiscoverAnchor` fills in.
 */
interface DiscoverAnchor extends TopicAnchor {
// Trimmed, non-empty pillar strings taken from the discover context.
pillars: string[];
// Suggested tags that passed the specificity filter in `buildDiscoverAnchor`.
specificTags: string[];
}
/**
 * Working record for one discovered account while candidates are being
 * collected, merged, scored and ranked.
 */
interface AccountCandidate {
// Handle as returned by `normalizeUsername`.
username: string;
// Sum of the weights of every sighting merged into this candidate.
score: number;
// Topic relevance of `reason`; may later be raised by an AI verdict.
relevance: number;
// Score reported by the AI relevance filter, when one was run.
aiScore?: number;
// Explanation reported by the AI relevance filter, when one was run.
aiReason?: string;
// Text explaining why the account surfaced (snippet, title or post text).
reason: string;
// Source recorded when the candidate was first created; merges do not change it.
source: SimilarAccount["source"];
// Normalized Threads post URL associated with the candidate, if any.
postUrl?: string;
// Tags supplied with the sighting that created the candidate.
tags?: string[];
}
/**
 * Description of the topic for which accounts are discovered.
 *
 * The whole object is handed to `buildTopicAnchor`; `label`, `query`, `brief`,
 * `exclusions` and `pillars` are also forwarded to the AI relevance filter.
 */
interface DiscoverContext {
label: string;
query: string;
// Optional free text; its first characters may be added to a web query.
brief?: string | null;
// Not read directly by the code in this file.
productContext?: string | null;
// Sub-topics; up to two qualifying ones produce extra web queries.
pillars?: string[];
// Tag suggestions filtered by `buildDiscoverAnchor` into `specificTags`.
suggestedTags?: string[];
// Forwarded unchanged to the AI relevance filter.
exclusions?: string[];
}
/**
 * Decides whether a raw handle may be treated as a real account name.
 *
 * The handle is normalized first and is rejected when it is empty, shorter
 * than 2 or longer than 30 characters, listed in `RESERVED_USERNAMES`
 * (case-insensitively), shaped like a placeholder (`creator_<digits>` or
 * `example_…`), or contains anything other than ASCII letters, digits,
 * `.` and `_`.
 *
 * @param username Raw handle to check.
 * @returns `true` when the handle passes every check.
 */
function isValidUsername(username: string): boolean {
  const handle = normalizeUsername(username);
  if (!handle) return false;

  const tooShort = handle.length < 2;
  const tooLong = handle.length > 30;
  if (tooShort || tooLong) return false;

  if (RESERVED_USERNAMES.has(handle.toLowerCase())) return false;

  const looksLikePlaceholder = /^(creator_\d+|example_.*)$/i.test(handle);
  if (looksLikePlaceholder) return false;

  return /^[a-zA-Z0-9._]+$/.test(handle);
}
/**
 * Builds the anchor used for account discovery: the topic anchor from
 * `buildTopicAnchor`, plus cleaned pillars and the subset of suggested tags
 * that are specific to the topic.
 *
 * A tag (leading `@` stripped, trimmed) is kept as specific when it has at
 * least 4 characters, is not in `BROAD_TAG_BLOCKLIST`, and one of:
 *  - it contains the core phrase, or the core phrase contains it;
 *  - there are 2+ required concepts and the tag contains at least 2 of them;
 *  - otherwise (fewer than 2 required concepts) its topic relevance is >= 6.
 *
 * Empty strings are never counted as a containment match:
 * `"x".includes("")` is always `true`, so an empty core phrase or an empty
 * concept would otherwise wave every tag through.
 *
 * @param ctx Discover context describing the topic.
 * @returns The topic anchor extended with `pillars` and `specificTags`.
 */
function buildDiscoverAnchor(ctx: DiscoverContext): DiscoverAnchor {
  const base = buildTopicAnchor(ctx);
  const pillars = (ctx.pillars ?? []).map((p) => p.trim()).filter(Boolean);
  const { corePhrase, requiredConcepts } = base;

  const specificTags = (ctx.suggestedTags ?? [])
    .map((t) => t.replace(/^@/, "").trim())
    .filter((tag) => {
      if (tag.length < 4 || BROAD_TAG_BLOCKLIST.has(tag)) return false;

      // Guard against an empty core phrase, which every string "contains".
      if (
        corePhrase.length > 0 &&
        (tag.includes(corePhrase) || corePhrase.includes(tag))
      ) {
        return true;
      }

      if (requiredConcepts.length >= 2) {
        const matched = requiredConcepts.filter(
          (c) => c.length > 0 && tag.includes(c)
        );
        return matched.length >= 2;
      }

      return scoreTopicRelevance(tag, base) >= 6;
    });

  return { ...base, pillars, specificTags };
}
/**
 * Maps a candidate's scores to a confidence label.
 *
 * - `"high"`: score above 20 and the candidate's source is `"threads"` or `"scan"`.
 * - `"medium"`: score above 10, or an AI score above 0.5.
 * - `"low"`: everything else.
 *
 * @param candidate Candidate to classify.
 * @returns The confidence label for the candidate.
 */
function assignConfidence(candidate: AccountCandidate): AccountConfidence {
  const { score, source } = candidate;
  const fromDirectSource = source === "threads" || source === "scan";
  if (score > 20 && fromDirectSource) return "high";

  const aiScore = candidate.aiScore ?? 0;
  return score > 10 || aiScore > 0.5 ? "medium" : "low";
}
/**
 * Collects the distinct hashtags found in a piece of text.
 *
 * A hashtag is `#` followed by 2 to 24 word characters or characters in the
 * U+4E00–U+9FFF range. The leading `#` is dropped, duplicates are removed in
 * first-seen order, and at most 6 tags are returned.
 *
 * @param text Text to scan.
 * @returns Up to six unique tag strings without the `#` prefix.
 */
function extractTagsFromText(text: string): string[] {
  const hashtags = text.match(/#[\w\u4e00-\u9fff]{2,24}/g);
  if (hashtags === null) return [];

  // Every match starts with "#", so dropping the first character strips it.
  const unique = new Set(hashtags.map((hashtag) => hashtag.slice(1)));
  return [...unique].slice(0, 6);
}
/**
 * Returns the handle of every Threads profile link found in the text, in
 * order of appearance and without de-duplication.
 *
 * A private copy of `THREADS_PROFILE_RE` is created for each call, so the
 * scan always starts at the beginning of `text` and leaves the shared
 * pattern untouched.
 *
 * @param text Text to scan for profile links.
 * @returns The captured handles (group 1 of each match).
 */
function extractUsernamesFromText(text: string): string[] {
  const pattern = new RegExp(THREADS_PROFILE_RE.source, THREADS_PROFILE_RE.flags);
  const handles: string[] = [];
  for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
    handles.push(hit[1]);
  }
  return handles;
}
/**
 * Records one sighting of an account in the candidate map.
 *
 * Handles that fail `isValidUsername` are ignored. Candidates are keyed by the
 * lower-cased normalized handle. A first sighting creates the entry; a repeat
 * sighting adds its weight to the score, replaces the stored reason and
 * relevance only when the new reason is more relevant, and overwrites the post
 * URL whenever the new sighting has one. Source and tags of an existing entry
 * are left as they were.
 *
 * @param map Candidate map to update in place.
 * @param username Raw handle of the sighted account.
 * @param params Sighting details; `weight` defaults to 1 and `anchor` is the
 *   topic anchor used to score `reason`.
 */
function addCandidate(
  map: Map<string, AccountCandidate>,
  username: string,
  params: {
    reason: string;
    source: SimilarAccount["source"];
    weight?: number;
    postUrl?: string;
    tags?: string[];
    anchor: TopicAnchor;
  }
) {
  const handle = normalizeUsername(username);
  if (!isValidUsername(handle)) return;

  const { reason, source, tags, anchor } = params;
  const relevance = scoreTopicRelevance(reason, anchor);
  const key = handle.toLowerCase();
  const known = map.get(key);
  const weight = params.weight ?? 1;
  const postUrl = params.postUrl
    ? normalizeThreadsPostUrl(params.postUrl) ?? undefined
    : undefined;

  if (known === undefined) {
    map.set(key, {
      username: handle,
      score: weight,
      relevance,
      reason,
      source,
      postUrl,
      tags,
    });
    return;
  }

  known.score += weight;
  if (relevance > known.relevance) {
    known.relevance = relevance;
    known.reason = reason;
  }
  if (postUrl) known.postUrl = postUrl;
}
/**
 * Produces the de-duplicated list of web-search queries for a topic.
 *
 * Seven fixed templates are built around the quoted core phrase. One more
 * query is added when the first 24 characters of the trimmed brief are at
 * least 4 characters long and score >= 6 for topic relevance. Each of the
 * first two pillars that is at least 4 characters long and scores >= 6 adds
 * two further queries.
 *
 * @param anchor Discover anchor supplying the core phrase and pillars.
 * @param brief Optional free-text brief.
 * @returns Unique query strings in insertion order.
 */
function buildWebSearchQueries(anchor: DiscoverAnchor, brief?: string | null): string[] {
  const phrase = `"${anchor.corePhrase}"`;
  const collected: string[] = [
    `site:threads.net ${phrase}`,
    `threads ${phrase} 帳號`,
    `threads ${phrase} 創作者`,
    `site:threads.net ${phrase} 推薦`,
    `site:threads.net ${phrase} 心得`,
    `${phrase} 創作者 threads`,
    `${phrase} site:threads.net`,
  ];

  const hint = (brief ?? "").trim().slice(0, 24);
  if (hint.length >= 4 && scoreTopicRelevance(hint, anchor) >= 6) {
    collected.push(`threads ${phrase} ${hint}`);
  }

  const pillarQueries = anchor.pillars
    .slice(0, 2)
    .filter((pillar) => pillar.length >= 4 && scoreTopicRelevance(pillar, anchor) >= 6)
    .flatMap((pillar) => [`site:threads.net "${pillar}"`, `threads "${pillar}" 推薦`]);

  return Array.from(new Set([...collected, ...pillarQueries]));
}
/**
 * Chooses up to three search terms for the in-app Threads search.
 *
 * The core phrase comes first when it has at least 4 characters, followed by
 * every specific tag that is 4 to 14 characters long and scores >= 6 for
 * topic relevance. Duplicates are removed before the list is cut to three.
 *
 * @param anchor Discover anchor supplying the core phrase and specific tags.
 * @returns At most three unique search terms.
 */
function buildThreadsSearchQueries(anchor: DiscoverAnchor): string[] {
  const fromCorePhrase = anchor.corePhrase.length >= 4 ? [anchor.corePhrase] : [];
  const fromTags = anchor.specificTags.filter(
    (tag) =>
      tag.length >= 4 &&
      tag.length <= 14 &&
      scoreTopicRelevance(tag, anchor) >= 6
  );
  return Array.from(new Set([...fromCorePhrase, ...fromTags])).slice(0, 3);
}
/**
 * Finds account candidates by running every web-search query for the topic
 * in parallel and scanning the results for Threads profile links.
 *
 * A query whose search rejects contributes an empty result list instead of
 * failing the whole discovery. For each result, the link, title and snippet
 * are joined into one text that is scored for topic relevance and scanned
 * for hashtags and profile handles. Every handle found is recorded with:
 *  - reason: the snippet, else the title, else a fixed fallback sentence,
 *    cut to 160 characters;
 *  - weight: 3 when the result link itself contains `/@<handle>`, otherwise 2,
 *    plus a quarter of the relevance, plus 0.5 for results whose provider is
 *    `"brave"`.
 *
 * @param anchor Discover anchor for the topic.
 * @param brief Optional brief passed on to the query builder.
 * @returns The candidates collected from all result batches.
 */
async function discoverFromWebSearch(
  anchor: DiscoverAnchor,
  brief?: string | null
): Promise<AccountCandidate[]> {
  const byUsername = new Map<string, AccountCandidate>();
  const searchQueries = buildWebSearchQueries(anchor, brief);
  const perQueryLimit = 15;

  // Best effort: a failed query yields no results rather than an exception.
  const runQuery = (query: string) =>
    searchWebThorough(query, perQueryLimit, {
      patrolMode: true,
      priority: "high",
      threadsOnly: true,
    }).catch(() => ({ results: [] }));

  const batches = await Promise.all(searchQueries.map((query) => runQuery(query)));

  for (const batch of batches) {
    for (const hit of batch.results) {
      const haystack = `${hit.link} ${hit.title} ${hit.snippet}`;
      const relevance = scoreTopicRelevance(haystack, anchor);
      const tags = extractTagsFromText(haystack);

      for (const username of extractUsernamesFromText(haystack)) {
        const reason =
          hit.snippet.trim() ||
          hit.title.trim() ||
          `在「${anchor.corePhrase}」相關網路搜尋結果中找到`;
        const postUrl = normalizeThreadsPostUrl(hit.link) ?? undefined;
        const linkWeight = hit.link.includes(`/@${username}`) ? 3 : 2;
        const providerBonus = hit.provider === "brave" ? 0.5 : 0;

        addCandidate(byUsername, username, {
          reason: reason.slice(0, 160),
          source: "web",
          weight: linkWeight + relevance / 4 + providerBonus,
          postUrl,
          tags,
          anchor,
        });
      }
    }
  }

  return Array.from(byUsername.values());
}
/**
 * Finds account candidates by running the Threads search queries one after
 * another and crediting the author of each returned post.
 *
 * Posts without an author name are skipped. Each author is recorded with the
 * first 100 characters of the trimmed post text as the reason (or a fixed
 * fallback sentence naming the query), and a weight of 2, plus half the
 * post's topic relevance, plus up to 2 extra points for likes (one point per
 * 100 likes). Any error raised while handling a query abandons the rest of
 * that query and moves on to the next one.
 *
 * @param storageState Passed through to `search`; presumably a saved browser
 *   session — confirm against the search helper.
 * @param anchor Discover anchor for the topic.
 * @param limit Maximum number of candidates to return (default 5).
 * @returns The top candidates ordered by `score + relevance`, descending.
 */
async function discoverFromThreadsSearch(
  storageState: string,
  anchor: DiscoverAnchor,
  limit = 5
): Promise<AccountCandidate[]> {
  const byUsername = new Map<string, AccountCandidate>();
  const combined = (c: AccountCandidate) => c.score + c.relevance;

  for (const term of buildThreadsSearchQueries(anchor)) {
    try {
      const posts = await search(storageState, term, 15);
      for (const post of posts) {
        if (!post.authorName) continue;

        const body = post.text.trim();
        const relevance = scoreTopicRelevance(body, anchor);
        const likeBonus = Math.min((post.likeCount ?? 0) / 100, 2);

        addCandidate(byUsername, post.authorName, {
          reason: body.slice(0, 100) || `在 Threads 搜尋「${term}」的熱門貼文中出現`,
          source: "threads",
          weight: 2 + relevance / 2 + likeBonus,
          postUrl: post.permalink,
          anchor,
        });
      }
    } catch {
      // Best effort: drop the remainder of this query and try the next one.
    }
  }

  const ordered = Array.from(byUsername.values());
  ordered.sort((a, b) => combined(b) - combined(a));
  return ordered.slice(0, limit);
}
/**
 * Returns the candidates ordered from strongest to weakest.
 *
 * The ranking value is `score + 2 * relevance + 3 * aiScore`, with a missing
 * AI score counted as 0. The input array is left untouched: sorting happens
 * on a shallow copy, so callers that still hold the original array keep its
 * order. The candidate objects themselves are shared, not cloned.
 *
 * @param candidates Candidates to rank.
 * @returns A new array holding the same candidate objects in ranked order.
 */
function rankCandidates(candidates: AccountCandidate[]): AccountCandidate[] {
  const rankValue = (c: AccountCandidate): number =>
    c.score + c.relevance * 2 + (c.aiScore ?? 0) * 3;

  // Array.prototype.sort works in place, so sort a copy instead of the argument.
  return [...candidates].sort((a, b) => rankValue(b) - rankValue(a));
}
/**
 * Annotates candidates with verdicts from the AI relevance filter.
 *
 * Without AI settings, or with no candidates, the input is returned as is.
 * Otherwise the first 18 candidates are sent to `filterDiscoverItemsWithAi`,
 * keyed by lower-cased username. Every candidate that received a verdict is
 * replaced by a copy carrying the verdict's score and reason; when the
 * verdict says the candidate is relevant, its relevance is also raised by
 * the verdict score times 4, rounded. Candidates without a verdict are
 * returned unchanged. No candidate is removed.
 *
 * @param candidates Candidates in the order they should be considered.
 * @param ctx Discover context forwarded to the AI filter.
 * @param anchor Discover anchor supplying the required concepts.
 * @param ai Provider, model and optional API keys; omit to skip the filter.
 * @returns The candidates, with AI fields filled in where a verdict exists.
 */
async function applyAiRelevanceFilter(
  candidates: AccountCandidate[],
  ctx: DiscoverContext,
  anchor: DiscoverAnchor,
  ai?: {
    aiProvider: string;
    aiModel: string;
    apiKeys?: ProviderApiKeys;
  }
): Promise<AccountCandidate[]> {
  if (!ai || candidates.length === 0) return candidates;

  const items: DiscoverFilterItem[] = [];
  for (const candidate of candidates.slice(0, 18)) {
    items.push({
      id: candidate.username.toLowerCase(),
      text: candidate.reason,
      username: candidate.username,
      source: candidate.source ?? "web",
      tags: candidate.tags,
    });
  }

  const { aiProvider, aiModel, apiKeys } = ai;
  const verdicts = await filterDiscoverItemsWithAi({
    label: ctx.label,
    query: ctx.query,
    brief: ctx.brief,
    exclusions: ctx.exclusions,
    pillars: ctx.pillars,
    requiredConcepts: anchor.requiredConcepts,
    items,
    aiProvider,
    aiModel,
    apiKeys,
  });

  return candidates.map((candidate) => {
    const verdict = verdicts.get(candidate.username.toLowerCase());
    if (!verdict) return candidate;

    let relevance = candidate.relevance;
    if (verdict.relevant) {
      relevance = candidate.relevance + Math.round(verdict.score * 4);
    }

    return {
      ...candidate,
      aiScore: verdict.score,
      aiReason: verdict.reason,
      relevance,
    };
  });
}
/**
 * Pass-through step in the discovery pipeline: hands back the very array it
 * was given, without inspecting or changing any candidate.
 *
 * @param candidates Ranked candidates.
 * @returns The same array instance.
 */
function verifyAccountConsistency(candidates: AccountCandidate[]): AccountCandidate[] {
  return candidates;
}
/**
 * Converts the first `limit` ranked candidates into the public
 * `SimilarAccount` shape.
 *
 * When an AI reason is present it is appended to the candidate's own reason
 * as a clearly separated suffix, `<reason> (AI: <aiReason>)`, so the two
 * texts do not run into each other. The profile URL comes from
 * `threadsProfileUrl` and the confidence label from `assignConfidence`.
 *
 * @param candidates Candidates in final ranked order.
 * @param limit Number of leading candidates to convert.
 * @returns The converted accounts, in the same order.
 */
function toSimilarAccounts(candidates: AccountCandidate[], limit: number): SimilarAccount[] {
  return candidates.slice(0, limit).map((c) => ({
    username: c.username,
    // Keep a visible separator between the sighting text and the AI explanation.
    reason: c.aiReason ? `${c.reason} (AI: ${c.aiReason})` : c.reason,
    source: c.source,
    profileUrl: threadsProfileUrl(c.username) ?? undefined,
    postUrl: c.postUrl,
    confidence: assignConfidence(c),
  }));
}
/**
 * Discovers accounts related to a topic and returns the best `limit` of them.
 *
 * Steps, in order:
 *  1. Build the discover anchor from `params`.
 *  2. When `storageState` is provided, collect candidates from the Threads
 *     search.
 *  3. Collect candidates from web search and merge them in by lower-cased
 *     username: scores are added, the higher relevance wins, a web post URL
 *     replaces the stored one, and the longer reason is kept.
 *  4. Rank; when both `aiProvider` and `aiModel` are set, apply the AI
 *     relevance filter and rank again.
 *  5. Run the consistency step, rank once more and convert the result.
 *
 * @param params Topic description, optional Threads session state, result
 *   limit (default 8) and optional AI provider settings.
 * @returns Up to `limit` similar accounts in ranked order.
 */
export async function discoverSimilarAccounts(params: {
  label: string;
  query: string;
  brief?: string | null;
  productContext?: string | null;
  pillars?: string[];
  suggestedTags?: string[];
  exclusions?: string[];
  storageState?: string | null;
  limit?: number;
  aiProvider?: string;
  aiModel?: string;
  apiKeys?: ProviderApiKeys;
}): Promise<SimilarAccount[]> {
  const { storageState, aiProvider, aiModel, apiKeys } = params;
  const limit = params.limit ?? 8;
  const anchor = buildDiscoverAnchor(params);
  const byUsername = new Map<string, AccountCandidate>();

  if (storageState) {
    const fromThreads = await discoverFromThreadsSearch(storageState, anchor, limit);
    fromThreads.forEach((candidate) => {
      byUsername.set(candidate.username.toLowerCase(), candidate);
    });
  }

  const fromWeb = await discoverFromWebSearch(anchor, params.brief);
  for (const webCandidate of fromWeb) {
    const key = webCandidate.username.toLowerCase();
    const prior = byUsername.get(key);
    if (!prior) {
      byUsername.set(key, webCandidate);
      continue;
    }
    prior.score += webCandidate.score;
    prior.relevance = Math.max(prior.relevance, webCandidate.relevance);
    if (webCandidate.postUrl) prior.postUrl = webCandidate.postUrl;
    if (webCandidate.reason.length > prior.reason.length) {
      prior.reason = webCandidate.reason;
    }
  }

  let ranked = rankCandidates(Array.from(byUsername.values()));

  if (aiProvider && aiModel) {
    const annotated = await applyAiRelevanceFilter(ranked, params, anchor, {
      aiProvider,
      aiModel,
      apiKeys,
    });
    ranked = rankCandidates(annotated);
  }

  ranked = rankCandidates(verifyAccountConsistency(ranked));
  return toSimilarAccounts(ranked, limit);
}