haixunMaster/lib/services/scan-web-discover.ts

440 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import "server-only";
import { computePlacementScore, computeScore, type RankedPost } from "@/lib/ranking";
import {
DEFAULT_WEB_SEARCH_MAX_QUERIES,
formatGoogleAfterDate,
PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS,
PLACEMENT_WEB_SEARCH_MAX_QUERIES,
PLACEMENT_WEB_SEARCH_TARGET_POSTS,
} from "@/lib/scan-recency";
import {
isBraveSearchConfigured,
type BraveWebSearchOptions,
} from "@/lib/services/web-search";
import type { KeywordPriority } from "@/lib/search";
import {
type ContentBandInput,
isInContentBand,
} from "@/lib/research-content-band";
import { hasPlacementIntent, looksLikeCasualChat } from "@/lib/topic-anchor";
import type { SearchIntent, SearchTagType } from "@/lib/types/research";
import {
isAccountTag,
normalizeThreadsPostUrl,
normalizeUsername,
type SimilarAccount,
} from "@/lib/types/research";
import { searchWebThorough } from "@/lib/services/web-search";
import { runWithConcurrency } from "@/lib/utils/concurrency";
/** Origin label attached to a scanned post. */
export type ScanPostSource = "account" | "keyword" | "web";
/**
 * A ranked post plus optional scan metadata.
 * - `searchTag`: the tag (keyword, phrase or `@username`) whose query surfaced the post.
 * - `scanSource`: which kind of scan produced the post.
 */
export type WebDiscoveredPost = RankedPost & {
searchTag?: string;
scanSource?: ScanPostSource;
};
/**
 * Tie-break rank per source. When two posts share the same key, the one whose
 * source has the higher number is kept (see pickPreferredPost).
 */
const SOURCE_PRIORITY: Record<ScanPostSource, number> = {
account: 3,
keyword: 2,
web: 1,
};
/**
 * Score multiplier per source, applied when a post is stamped with its source
 * (see withSource): account posts are boosted, web-search posts are damped.
 */
const SOURCE_SCORE_BOOST: Record<ScanPostSource, number> = {
account: 1.35,
keyword: 1,
web: 0.85,
};
/**
 * Builds the de-duplication key for a post.
 *
 * Preference order: permalink, then external id, then a synthetic
 * `author:first-120-chars-of-trimmed-text` fallback.
 *
 * `||` is used on purpose instead of `??`: an empty-string permalink or id
 * carries no identity, and keeping it would make every such post collide on
 * the key `""` and be dropped as a "duplicate".
 *
 * @param post Any object carrying the identifying fields of a post.
 * @returns A string that is equal for posts considered the same.
 */
function postKey(post: {
  permalink?: string;
  externalId?: string;
  authorName?: string;
  text: string;
}) {
  return (
    post.permalink ||
    post.externalId ||
    `${post.authorName ?? ""}:${post.text.trim().slice(0, 120)}`
  );
}
/**
 * Stamps a post with the scan source that produced it and rescales its score
 * by that source's multiplier from SOURCE_SCORE_BOOST.
 *
 * @param post Ranked post, optionally carrying the tag that found it.
 * @param scanSource Source label to record on the post.
 * @returns A new object; the input post is not modified.
 */
function withSource(
  post: RankedPost & { searchTag?: string },
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const multiplier = SOURCE_SCORE_BOOST[scanSource];
  const stamped: WebDiscoveredPost = {
    ...post,
    scanSource,
    score: post.score * multiplier,
  };
  return stamped;
}
/**
 * Optional per-tag search metadata used when composing web-search queries.
 * In the visible code only `searchIntent` is read (it selects the intent
 * phrase of placement-mode queries); `searchType` is carried along unused here.
 */
interface TagSearchMeta {
searchIntent?: SearchIntent;
searchType?: SearchTagType;
}
/**
 * Builds exactly one high-signal query per keyword so Brave quota usage stays
 * under control.
 *
 * Tags whose intent is one of the three "need"-type intents get the
 * recommendation-seeking phrase; every other tag gets the generic question
 * phrase. The query is restricted to recent results with an `after:` date.
 *
 * @param tag Keyword to search for (quoted verbatim in the query).
 * @param meta Optional tag metadata; only `searchIntent` is consulted.
 * @returns A single-element list of query strings.
 */
function buildPlacementKeywordQueries(tag: string, meta?: TagSearchMeta): string[] {
  const sinceDate = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
  let intentPhrase = "請問";
  switch (meta?.searchIntent) {
    case "需求":
    case "求助":
    case "痛點":
      intentPhrase = "求推薦";
      break;
  }
  return [`site:threads.net "${tag}" ${intentPhrase} after:${sinceDate}`];
}
/**
 * Resolves the maximum number of Brave queries for one scan.
 *
 * A positive integer in the `SCAN_BRAVE_MAX_QUERIES` environment variable
 * overrides the built-in default, but is clamped to a hard ceiling (20 in
 * placement mode, 30 otherwise). A missing, non-numeric or non-positive value
 * falls back to the mode's default constant.
 *
 * @param placementMode Whether the scan runs in placement mode.
 * @returns The query cap to apply.
 */
function resolveBraveQueryCap(placementMode: boolean): number {
  const envValue = process.env.SCAN_BRAVE_MAX_QUERIES?.trim();
  const requested = envValue ? Number.parseInt(envValue, 10) : NaN;
  const overrideUsable = Number.isFinite(requested) && requested > 0;
  if (!overrideUsable) {
    return placementMode ? PLACEMENT_WEB_SEARCH_MAX_QUERIES : DEFAULT_WEB_SEARCH_MAX_QUERIES;
  }
  const hardCeiling = placementMode ? 20 : 30;
  return Math.min(requested, hardCeiling);
}
/**
 * Produces the web-search queries for one keyword tag.
 *
 * Placement mode delegates to buildPlacementKeywordQueries; otherwise a single
 * plain `site:threads.net` query quoting the tag is returned.
 *
 * @param tag Keyword to search for.
 * @param placementMode Whether to build placement-oriented queries.
 * @param meta Optional tag metadata, forwarded in placement mode only.
 * @returns The list of query strings for this tag.
 */
function buildKeywordQueries(
  tag: string,
  placementMode: boolean,
  meta?: TagSearchMeta
): string[] {
  return placementMode
    ? buildPlacementKeywordQueries(tag, meta)
    : [`site:threads.net "${tag}"`];
}
/**
 * Decides whether a web-discovered text is worth keeping.
 *
 * Outside placement mode everything passes. In placement mode the text is
 * rejected when it looks like casual chat, when it shows no placement intent,
 * or when a content band is given and the text falls outside it.
 *
 * @param text Text of the candidate post.
 * @param placementMode Whether placement filtering applies.
 * @param contentBand Optional content band to check the text against.
 * @returns `true` when the post should be kept.
 */
function passesPlacementWebFilter(
  text: string,
  placementMode: boolean,
  contentBand?: ContentBandInput
): boolean {
  if (placementMode) {
    if (looksLikeCasualChat(text) || !hasPlacementIntent(text)) return false;
    if (contentBand && !isInContentBand(text, contentBand)) return false;
  }
  return true;
}
/**
 * Scores a discovered post with the scorer that matches the scan mode:
 * computePlacementScore in placement mode, computeScore otherwise.
 *
 * @param raw Post fields required by the scorers; engagement and date are optional.
 * @param placementMode Whether to use the placement scorer.
 * @returns The unboosted score of the post.
 */
function scoreDiscoveredPost(
  raw: {
    text: string;
    permalink: string;
    authorName: string;
    externalId: string;
    postedAt?: Date;
    likeCount?: number;
    replyCount?: number;
  },
  placementMode: boolean
): number {
  if (placementMode) {
    return computePlacementScore(raw);
  }
  return computeScore(raw);
}
/**
 * Turns one web-search hit into a scored post, or `null` when the hit is not
 * usable.
 *
 * A hit is rejected when its link does not normalise to a Threads post URL,
 * when the normalised URL does not have the `@author/post/id` shape, or when
 * title and snippet together are shorter than 8 characters.
 *
 * NOTE(review): the pattern only matches `threads.com`, while the queries in
 * this file search `site:threads.net`. This presumably relies on
 * normalizeThreadsPostUrl rewriting the host — confirm.
 *
 * @param link URL of the search hit.
 * @param title Title of the search hit.
 * @param snippet Snippet of the search hit.
 * @param searchTag Tag whose query produced the hit.
 * @param scanSource Source label to stamp on the post (default `"web"`).
 * @param placementMode Whether to score with the placement scorer.
 * @returns The scored, source-stamped post, or `null`.
 */
function parsePostFromUrl(
  link: string,
  title: string,
  snippet: string,
  searchTag: string,
  scanSource: ScanPostSource = "web",
  placementMode = false
): WebDiscoveredPost | null {
  const permalink = normalizeThreadsPostUrl(link);
  if (!permalink) return null;
  const parts = /threads\.com\/@([^/]+)\/post\/([^/?#]+)/i.exec(permalink);
  if (!parts) return null;
  // The searchable text is "title — snippet", skipping whichever part is blank.
  const text = [title, snippet]
    .map((piece) => piece.trim())
    .filter(Boolean)
    .join(" — ");
  if (text.length < 8) return null;
  const raw = { text, permalink, authorName: parts[1], externalId: parts[2] };
  const score = scoreDiscoveredPost(raw, placementMode);
  return withSource({ ...raw, score, searchTag }, scanSource);
}
/**
 * Converts the reference post attached to an account found in the research
 * map into a high-quality seed post.
 *
 * Returns `null` when the account has no post URL or the URL is not a Threads
 * post URL. The seed's score is the regular score times 1.2, and it is stamped
 * with the `"account"` source.
 *
 * NOTE(review): the seed's `text` is `account.reason` when present. A comment
 * in discoverPostsFromSimilarAccounts says `reason` explains why the account
 * was recommended and must not be shown as post text — confirm whether that
 * rule should apply here too.
 *
 * @param account Similar account, possibly carrying a reference post URL.
 * @returns The seed post, or `null` when none can be built.
 */
export function postFromSimilarAccountSeed(account: SimilarAccount): WebDiscoveredPost | null {
  const permalink = account.postUrl ? normalizeThreadsPostUrl(account.postUrl) : null;
  if (!permalink) return null;
  const parts = /threads\.com\/@([^/]+)\/post\/([^/?#]+)/i.exec(permalink);
  if (!parts) return null;
  // Prefer the account's own username; fall back to the author in the URL.
  const username = normalizeUsername(account.username) || parts[1];
  const fallbackText = `相似帳號 @${username} 的參考貼文`;
  const seed = {
    text: (account.reason || fallbackText).slice(0, 280),
    permalink,
    authorName: username,
    externalId: parts[2],
  };
  const boostedScore = computeScore(seed) * 1.2;
  return withSource({ ...seed, score: boostedScore, searchTag: `@${username}` }, "account");
}
/** Options accepted by discoverPostsViaWebSearch. All fields are optional. */
export interface WebDiscoverOptions {
/** Maximum results requested per query (defaults to 15). */
perQueryLimit?: number;
/** Placement mode: prioritise help-seeking / recommendation-seeking posts and use `after:` to keep only recent ones. */
placementMode?: boolean;
/** Number of queries run in parallel (defaults to 2 in both placement and normal mode). */
concurrency?: number;
/** Each tag's searchIntent / searchType, used to build more precise web-search queries. */
tagMeta?: Map<string, TagSearchMeta>;
/** Placement mode: the research map's audience questions / content pillars / exclusions. */
contentBand?: ContentBandInput;
/** Maximum number of Brave web-search queries; searching stops once it is reached or exceeded. */
maxQueries?: number;
/** Stop once this many posts have been found (defaults to a placement constant in placement mode, 30 otherwise). */
targetPosts?: number;
/** 0 means: do not run Brave web search. Used as the query cap when `maxQueries` is not given. */
braveQueryBudget?: number;
/** MVP: only high priority uses Brave. Defaults to high in placement mode and medium otherwise. */
keywordPriority?: KeywordPriority;
/** Receives human-readable progress messages; may be async, in which case it is awaited. */
onProgress?: (message: string) => void | Promise<void>;
}
/**
 * Builds the Brave search options shared by the discovery functions.
 *
 * The priority is the caller's `keywordPriority` when given; otherwise it is
 * `"high"` in placement mode and `"medium"` in normal mode. Patrol mode and
 * Threads-only filtering are always switched on.
 *
 * @param placementMode Whether the scan runs in placement mode.
 * @param keywordPriority Explicit priority, overriding the mode default.
 * @returns Options to pass to the web-search service.
 */
function resolveBraveSearchOptions(
  placementMode: boolean,
  keywordPriority?: KeywordPriority
): BraveWebSearchOptions {
  const modeDefault = placementMode ? "high" : "medium";
  const resolved: BraveWebSearchOptions = {
    patrolMode: true,
    priority: keywordPriority ?? modeDefault,
    threadsOnly: true,
  };
  return resolved;
}
/**
 * Finds Threads post links through Brave Search, as a supplement for when the
 * Threads API / crawler does not surface enough posts.
 *
 * Flow:
 * 1. Build one query job per keyword tag (account tags are skipped), plus, in
 *    placement mode, up to three jobs from short content-band phrases.
 * 2. Run the jobs in chunks until the jobs run out, the query budget is spent,
 *    or enough posts were collected.
 * 3. Parse, filter and de-duplicate the hits, then sort by score.
 *
 * Nothing is searched (and `[]` is returned) when there is nothing to search
 * for, when the resolved priority is not `"high"`, or when the query budget
 * is zero or negative.
 *
 * @param tags Search tags; those recognised by `isAccountTag` are ignored here.
 * @param options Tuning knobs, see WebDiscoverOptions.
 * @returns De-duplicated posts, highest score first.
 */
export async function discoverPostsViaWebSearch(
  tags: string[],
  options?: WebDiscoverOptions
): Promise<WebDiscoveredPost[]> {
  const perQueryLimit = options?.perQueryLimit ?? 15;
  const placementMode = options?.placementMode ?? false;
  const contentBand = options?.contentBand;
  // A concurrency of 0, a fraction below 1 or NaN would make every chunk empty,
  // so the loop below would never advance. Normalise to a whole number >= 1.
  const concurrency = Math.max(1, Math.floor(options?.concurrency ?? 2) || 1);
  const maxQueries =
    options?.maxQueries ?? options?.braveQueryBudget ?? resolveBraveQueryCap(placementMode);
  const targetPosts =
    options?.targetPosts ?? (placementMode ? PLACEMENT_WEB_SEARCH_TARGET_POSTS : 30);
  const onProgress = options?.onProgress;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  const keywordTags = tags.filter((t) => !isAccountTag(t));
  if (keywordTags.length === 0 && !contentBand) return [];
  if (braveOptions.priority !== "high") {
    // NOTE(review): this message opens a full-width parenthesis that is never
    // closed — confirm against the intended copy.
    await onProgress?.("略過 Brave 網搜(僅 high priority keyword 使用 Brave");
    return [];
  }
  if (maxQueries <= 0) {
    await onProgress?.("已略過 Brave 網搜(額度保護)");
    return [];
  }

  type QueryJob = { tag: string; query: string };
  const jobs: QueryJob[] = [];
  for (const tag of keywordTags) {
    const meta = options?.tagMeta?.get(tag);
    for (const query of buildKeywordQueries(tag, placementMode, meta)) {
      jobs.push({ tag, query });
    }
  }
  if (placementMode && contentBand) {
    const after = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
    // Only short phrases (4-16 chars) are used as exact-match search terms.
    const bandPhrases = [
      ...contentBand.questions.slice(0, 2),
      ...contentBand.pillars.slice(0, 1),
    ]
      .map((p) => p.trim())
      .filter((p) => p.length >= 4 && p.length <= 16);
    for (const phrase of bandPhrases) {
      jobs.push({
        tag: phrase,
        query: `site:threads.net "${phrase}" 求推薦 after:${after}`,
      });
    }
  }

  const seen = new Set<string>();
  const posts: WebDiscoveredPost[] = [];
  let queriesUsed = 0;
  let jobIndex = 0;

  // Appends posts not seen before, keyed by postKey.
  const mergeFound = (found: WebDiscoveredPost[]) => {
    for (const post of found) {
      const key = postKey(post);
      if (seen.has(key)) continue;
      seen.add(key);
      posts.push(post);
    }
  };

  // Runs one query and returns the hits that parse and pass the placement filter.
  const runJob = async (job: QueryJob): Promise<WebDiscoveredPost[]> => {
    try {
      const { results } = await searchWebThorough(job.query, perQueryLimit, braveOptions);
      const found: WebDiscoveredPost[] = [];
      for (const item of results) {
        const post = parsePostFromUrl(
          item.link,
          item.title,
          item.snippet,
          job.tag,
          "web",
          placementMode
        );
        if (post && passesPlacementWebFilter(post.text, placementMode, contentBand)) {
          found.push(post);
        }
      }
      return found;
    } catch {
      // Best effort: one failed query must not abort the whole scan.
      return [];
    }
  };

  while (
    jobIndex < jobs.length &&
    queriesUsed < maxQueries &&
    posts.length < targetPosts
  ) {
    // Never send more queries than the remaining budget allows; previously a
    // full chunk was sent even when only part of it fit in the budget.
    const remainingBudget = Math.max(1, Math.floor(maxQueries - queriesUsed));
    const chunk = jobs.slice(jobIndex, jobIndex + Math.min(concurrency, remainingBudget));
    jobIndex += chunk.length;
    queriesUsed += chunk.length;
    const batches = await runWithConcurrency(chunk, runJob, {
      concurrency,
      staggerMs: placementMode ? [600, 1400] : [2000, 5000],
    });
    mergeFound(batches.flat());
    await onProgress?.(
      `網搜 ${queriesUsed}/${Math.min(maxQueries, jobs.length)} 次 · 已找到 ${posts.length}` +
        (posts.length >= targetPosts ? "(達標,停止)" : "")
    );
  }

  // An empty result with no Brave key configured is most likely a setup issue; say so.
  if (posts.length === 0 && !isBraveSearchConfigured()) {
    await onProgress?.("未設定 BRAVE_SEARCH_API_KEY請以 Threads API瀏覽器海巡為主");
  }
  return posts.sort((a, b) => b.score - a.score);
}
/**
 * Runs `site:threads.net/@username` web searches for similar accounts, so
 * account-oriented posts can still be collected when a browser cannot be
 * opened.
 *
 * Brave is only used when the resolved priority is `"high"`; otherwise the
 * function returns `[]` without searching. In placement mode three queries per
 * account are issued (two intent phrases plus a plain one), all restricted to
 * recent posts with `after:`; in normal mode a single plain query is issued.
 * Queries run one after another, and a failing query is skipped.
 *
 * @param accounts Similar accounts to look up; entries without a usable
 *   username, or repeating an earlier username, are skipped.
 * @param options `perAccountLimit` (results per query, default 15),
 *   `placementMode` (default `false`) and an optional `keywordPriority`.
 * @returns De-duplicated posts stamped with the `"account"` source, highest
 *   score first.
 */
export async function discoverPostsFromSimilarAccounts(
  accounts: SimilarAccount[],
  options?: {
    perAccountLimit?: number;
    placementMode?: boolean;
    keywordPriority?: KeywordPriority;
  }
): Promise<WebDiscoveredPost[]> {
  const placementMode = options?.placementMode ?? false;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);
  // The priority does not depend on the account, so decide once up front
  // instead of building queries for every account and then discarding them.
  if (braveOptions.priority !== "high") return [];

  const perAccountLimit = options?.perAccountLimit ?? 15;
  // Loop-invariant recency suffix, computed once for all accounts.
  const after = placementMode
    ? ` after:${formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS)}`
    : "";
  const seen = new Set<string>();
  const searchedUsernames = new Set<string>();
  const posts: WebDiscoveredPost[] = [];

  for (const account of accounts) {
    const username = normalizeUsername(account.username);
    if (!username) continue;
    // The same username would repeat identical queries and only spend quota.
    if (searchedUsernames.has(username)) continue;
    searchedUsernames.add(username);
    const tag = `@${username}`;
    // account.reason is "why this account was recommended", not the post's own
    // text; it must not be passed off as post text when displayed.
    const queries = placementMode
      ? [
          `site:threads.net/@${username} 求推薦${after}`,
          `site:threads.net/@${username} 請益${after}`,
          `site:threads.net/@${username}${after}`,
        ]
      : [`site:threads.net/@${username}`];
    for (const query of queries) {
      try {
        const { results } = await searchWebThorough(query, perAccountLimit, braveOptions);
        for (const item of results) {
          const post = parsePostFromUrl(
            item.link,
            item.title,
            item.snippet,
            tag,
            "account",
            placementMode
          );
          if (!post) continue;
          const key = postKey(post);
          if (seen.has(key)) continue;
          seen.add(key);
          posts.push(post);
        }
      } catch {
        // A failure for a single account must not block the whole scan.
      }
    }
  }
  return posts.sort((a, b) => b.score - a.score);
}
/**
 * Chooses which of two posts with the same key to keep.
 *
 * The post from the higher-priority source wins (a post without a source is
 * treated as `"keyword"`). On equal priority the higher score wins, and an
 * exact score tie goes to the incoming post.
 *
 * @param existing Post already stored under the key.
 * @param incoming Post that collides with it.
 * @returns Whichever of the two should be kept.
 */
function pickPreferredPost(
  existing: WebDiscoveredPost,
  incoming: WebDiscoveredPost
): WebDiscoveredPost {
  const rankOf = (post: WebDiscoveredPost) => SOURCE_PRIORITY[post.scanSource ?? "keyword"];
  const currentRank = rankOf(existing);
  const candidateRank = rankOf(incoming);
  if (candidateRank > currentRank) return incoming;
  if (candidateRank < currentRank) return existing;
  if (incoming.score >= existing.score) {
    return incoming;
  }
  return existing;
}
/**
 * Merges two post lists into one de-duplicated, score-sorted list.
 *
 * Posts are keyed with postKey; on a collision pickPreferredPost decides which
 * one stays. Primary posts are processed before supplemental ones.
 *
 * @param primary Posts processed first.
 * @param supplemental Posts processed afterwards.
 * @param max Upper bound on the number of posts returned.
 * @returns At most `max` posts, highest score first.
 */
export function mergeScanPosts(
  primary: WebDiscoveredPost[],
  supplemental: WebDiscoveredPost[],
  max: number
): WebDiscoveredPost[] {
  const winners = new Map<string, WebDiscoveredPost>();
  for (const batch of [primary, supplemental]) {
    for (const candidate of batch) {
      const key = postKey(candidate);
      const holder = winners.get(key);
      winners.set(key, holder ? pickPreferredPost(holder, candidate) : candidate);
    }
  }
  const ranked = Array.from(winners.values());
  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, max);
}
/**
 * Public entry point for stamping a ranked post with its scan source.
 * Delegates to withSource, so the source's score multiplier is applied too.
 *
 * @param post Ranked post, optionally carrying the tag that found it.
 * @param scanSource Source label to record on the post.
 * @returns A new, source-stamped post.
 */
export function tagPostSource<T extends RankedPost & { searchTag?: string }>(
  post: T,
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const stamped = withSource(post, scanSource);
  return stamped;
}