haixunMaster/lib/services/scan-web-discover.ts

441 lines
13 KiB
TypeScript
Raw Normal View History

2026-06-21 12:50:31 +00:00
import "server-only";
import { computePlacementScore, computeScore, type RankedPost } from "@/lib/ranking";
import {
DEFAULT_WEB_SEARCH_MAX_QUERIES,
formatGoogleAfterDate,
PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS,
PLACEMENT_WEB_SEARCH_MAX_QUERIES,
PLACEMENT_WEB_SEARCH_TARGET_POSTS,
} from "@/lib/scan-recency";
import {
isBraveSearchConfigured,
type BraveWebSearchOptions,
} from "@/lib/services/web-search";
import type { KeywordPriority } from "@/lib/search";
import {
type ContentBandInput,
isInContentBand,
} from "@/lib/research-content-band";
import { hasPlacementIntent, looksLikeCasualChat } from "@/lib/topic-anchor";
import type { SearchIntent, SearchTagType } from "@/lib/types/research";
import {
isAccountTag,
normalizeThreadsPostUrl,
normalizeUsername,
type SimilarAccount,
} from "@/lib/types/research";
import { searchWebThorough } from "@/lib/services/web-search";
import { runWithConcurrency } from "@/lib/utils/concurrency";
/** Discovery channel a scanned post came from. */
export type ScanPostSource = "account" | "keyword" | "web";
/** A ranked post annotated with how it was discovered during a scan. */
export type WebDiscoveredPost = RankedPost & {
// Tag (keyword, content-band phrase, or `@username`) whose query surfaced the post.
searchTag?: string;
// Channel that produced the post; posts without it are treated as "keyword" when merging.
scanSource?: ScanPostSource;
};
// Tie-break rank used when the same post is found through several channels:
// the copy from the channel with the higher number is kept.
const SOURCE_PRIORITY: Record<ScanPostSource, number> = {
account: 3,
keyword: 2,
web: 1,
};
// Multiplier applied to a post's score when it is tagged with its channel:
// account posts are boosted, keyword posts are unchanged, web-search posts are damped.
const SOURCE_SCORE_BOOST: Record<ScanPostSource, number> = {
account: 1.35,
keyword: 1,
web: 0.85,
};
/**
 * Builds the identity key used to de-duplicate posts.
 *
 * Preference order: permalink, then external id, then a synthetic
 * `author:first-120-chars-of-trimmed-text` key.
 *
 * @param post Any object carrying the identifying fields of a post.
 * @returns A string that is equal for two records describing the same post.
 */
function postKey(post: {
  permalink?: string;
  externalId?: string;
  authorName?: string;
  text: string;
}) {
  // A plain `??` chain would accept "" as a valid id, which makes every post
  // with a blank permalink share the key "" and collapse into a single entry.
  // Blank ids are therefore skipped so the next candidate is used instead.
  const stableId = [post.permalink, post.externalId].find(
    (candidate) => typeof candidate === "string" && candidate.trim() !== ""
  );
  return stableId ?? `${post.authorName ?? ""}:${post.text.trim().slice(0, 120)}`;
}
/**
 * Returns a copy of `post` stamped with its discovery channel, with the score
 * scaled by that channel's multiplier from `SOURCE_SCORE_BOOST`.
 *
 * The input object is not mutated.
 */
function withSource(
  post: RankedPost & { searchTag?: string },
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const boostedScore = post.score * SOURCE_SCORE_BOOST[scanSource];
  const tagged: WebDiscoveredPost = { ...post, scanSource, score: boostedScore };
  return tagged;
}
/** Optional per-tag hints used when composing web-search queries. */
interface TagSearchMeta {
// Intent label of the tag; need/help/pain-point intents switch the placement query wording.
searchIntent?: SearchIntent;
// Tag type from the research model; not read by the query builders visible in this section.
searchType?: SearchTagType;
}
/**
 * Builds exactly one high-signal query per keyword to keep Brave quota usage low.
 *
 * The query is restricted to threads.com, quotes the tag, appends an intent
 * word chosen from the tag's `searchIntent`, and adds an `after:` date filter
 * derived from `PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS`.
 */
function buildPlacementKeywordQueries(tag: string, meta?: TagSearchMeta): string[] {
  const after = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);

  // Need / help / pain-point tags look for recommendation requests;
  // every other tag falls back to a generic question word.
  let intent = "請問";
  switch (meta?.searchIntent) {
    case "需求":
    case "求助":
    case "痛點":
      intent = "求推薦";
      break;
    default:
      break;
  }

  return [`site:threads.com "${tag}" ${intent} after:${after}`];
}
/**
 * Resolves the maximum number of Brave queries for one scan.
 *
 * A positive integer in the `SCAN_BRAVE_MAX_QUERIES` environment variable wins,
 * clamped to a hard ceiling (20 in placement mode, 30 otherwise). When the
 * variable is unset, blank, non-numeric or not positive, the mode-specific
 * default constant is returned instead.
 */
function resolveBraveQueryCap(placementMode: boolean): number {
  const hardCeiling = placementMode ? 20 : 30;
  const configured = process.env.SCAN_BRAVE_MAX_QUERIES?.trim() ?? "";
  const requested = configured === "" ? NaN : Number.parseInt(configured, 10);

  if (!Number.isFinite(requested) || requested <= 0) {
    return placementMode ? PLACEMENT_WEB_SEARCH_MAX_QUERIES : DEFAULT_WEB_SEARCH_MAX_QUERIES;
  }
  return Math.min(requested, hardCeiling);
}
/**
 * Returns the web-search queries for one keyword tag.
 *
 * Outside placement mode the quoted tag is searched on both Threads domains;
 * in placement mode the work is delegated to `buildPlacementKeywordQueries`.
 */
function buildKeywordQueries(
  tag: string,
  placementMode: boolean,
  meta?: TagSearchMeta
): string[] {
  if (!placementMode) {
    return ["threads.com", "threads.net"].map((host) => `site:${host} "${tag}"`);
  }
  return buildPlacementKeywordQueries(tag, meta);
}
/**
 * Decides whether a web-search hit is worth keeping.
 *
 * Outside placement mode everything passes. In placement mode the text is
 * rejected when it looks like casual chat, when it shows no placement intent,
 * or when a content band is supplied and the text falls outside it. The checks
 * run in that order and stop at the first rejection.
 */
function passesPlacementWebFilter(
  text: string,
  placementMode: boolean,
  contentBand?: ContentBandInput
): boolean {
  if (!placementMode) return true;

  const rejected =
    looksLikeCasualChat(text) ||
    !hasPlacementIntent(text) ||
    (contentBand ? !isInContentBand(text, contentBand) : false);

  return rejected ? false : true;
}
/**
 * Scores a freshly parsed post with the scorer that matches the scan mode:
 * `computePlacementScore` in placement mode, `computeScore` otherwise.
 */
function scoreDiscoveredPost(
  raw: {
    text: string;
    permalink: string;
    authorName: string;
    externalId: string;
    postedAt?: Date;
    likeCount?: number;
    replyCount?: number;
  },
  placementMode: boolean
): number {
  if (placementMode) {
    return computePlacementScore(raw);
  }
  return computeScore(raw);
}
/**
 * Turns one search result into a scored post, or `null` when it is unusable.
 *
 * A result is dropped when its link does not normalize to a Threads post URL,
 * when the normalized URL lacks the `@author/post/id` shape, or when the
 * combined title and snippet text is shorter than 8 characters.
 *
 * @param link Result URL as returned by the search provider.
 * @param title Result title; used as the first part of the post text.
 * @param snippet Result snippet; appended to the title with an em-dash separator.
 * @param searchTag Tag recorded on the post as the one that surfaced it.
 * @param scanSource Channel recorded on the post (defaults to "web").
 * @param placementMode Selects the placement scorer instead of the general one.
 */
function parsePostFromUrl(
  link: string,
  title: string,
  snippet: string,
  searchTag: string,
  scanSource: ScanPostSource = "web",
  placementMode = false
): WebDiscoveredPost | null {
  const permalink = normalizeThreadsPostUrl(link);
  if (!permalink) return null;

  const urlParts = permalink.match(/threads\.com\/@([^/]+)\/post\/([^/?#]+)/i);
  if (!urlParts) return null;

  // Title and snippet are the only text a search hit offers; blank parts are skipped.
  const pieces: string[] = [];
  for (const candidate of [title, snippet]) {
    const trimmed = candidate.trim();
    if (trimmed) pieces.push(trimmed);
  }
  const text = pieces.join(" — ");
  if (text.length < 8) return null;

  const raw = { text, permalink, authorName: urlParts[1], externalId: urlParts[2] };
  const score = scoreDiscoveredPost(raw, placementMode);
  return withSource({ ...raw, score, searchTag }, scanSource);
}
/**
 * Converts the reference post attached to a similar account (found while
 * building the research map) into a high-quality seed post.
 *
 * Returns `null` when the account has no `postUrl`, or when that URL does not
 * normalize to a Threads `@author/post/id` link. The seed is scored with
 * `computeScore` times 1.2 and then tagged as an "account" post.
 *
 * NOTE(review): the seed's `text` is taken from `account.reason`. A comment in
 * `discoverPostsFromSimilarAccounts` states that `reason` explains why the
 * account was recommended and must not be presented as post text — confirm
 * whether this seed should carry only the placeholder text instead.
 */
export function postFromSimilarAccountSeed(account: SimilarAccount): WebDiscoveredPost | null {
  if (!account.postUrl) return null;
  const permalink = normalizeThreadsPostUrl(account.postUrl);
  if (!permalink) return null;
  const match = permalink.match(/threads\.com\/@([^/]+)\/post\/([^/?#]+)/i);
  if (!match) return null;

  const username = normalizeUsername(account.username) || match[1];
  const tag = `@${username}`;

  // A whitespace-only reason carries no information, so it falls back to the placeholder.
  const reason = account.reason?.trim();
  let text = (reason || `相似帳號 @${username} 的參考貼文`).slice(0, 280);

  // Cutting at a fixed UTF-16 offset can split a surrogate pair (e.g. an emoji);
  // drop the dangling high surrogate so the text stays well-formed.
  const lastUnit = text.charCodeAt(text.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    text = text.slice(0, -1);
  }

  const raw = {
    text,
    permalink,
    authorName: username,
    externalId: match[2],
  };
  return withSource(
    {
      ...raw,
      score: computeScore(raw) * 1.2,
      searchTag: tag,
    },
    "account"
  );
}
/** Tuning knobs for `discoverPostsViaWebSearch`. */
export interface WebDiscoverOptions {
/** Result limit passed to each search call (defaults to 10). */
perQueryLimit?: number;
/** Placement mode: prioritize help-seeking / recommendation-seeking posts and use `after:` to keep only recent ones. */
placementMode?: boolean;
/** Number of queries run in parallel (default 2 in placement mode, 2 otherwise). */
concurrency?: number;
/** Per-tag searchIntent / searchType, used to build more precise web-search queries. */
tagMeta?: Map<string, TagSearchMeta>;
/** Placement mode: audience questions / content pillars / exclusions from the research map. */
contentBand?: ContentBandInput;
/** Maximum number of Brave web-search queries; searching stops once the cap is reached or exceeded. */
maxQueries?: number;
/** Stop searching once this many posts have been found. */
targetPosts?: number;
/** 0 means the Brave web search is not run at all. */
braveQueryBudget?: number;
/** MVP: only "high" priority uses Brave. */
keywordPriority?: KeywordPriority;
/** Receives human-readable progress messages; may be async. */
onProgress?: (message: string) => void | Promise<void>;
}
/**
 * Builds the option object handed to the web-search service.
 *
 * An explicit `keywordPriority` is used as-is; otherwise placement scans
 * default to "high" and general scans to "medium". Patrol mode and the
 * Threads-only restriction are always switched on.
 */
function resolveBraveSearchOptions(
  placementMode: boolean,
  keywordPriority?: KeywordPriority
): BraveWebSearchOptions {
  let priority = keywordPriority;
  if (priority == null) {
    priority = placementMode ? "high" : "medium";
  }
  return { patrolMode: true, priority, threadsOnly: true };
}
/**
 * Finds Threads post links through Brave Search, as a supplement for when the
 * Threads API / crawler does not surface enough posts.
 *
 * Account tags are ignored; every remaining tag is expanded into search
 * queries, and in placement mode a few short content-band phrases are added
 * as extra queries. Queries run in small parallel batches until the jobs run
 * out, the query cap is reached, or enough posts have been collected.
 *
 * Nothing is searched (and `[]` is returned) when there is neither a keyword
 * tag nor a content band, when the resolved priority is not "high", or when
 * the query cap is zero or negative.
 *
 * @param tags Search tags; entries recognized by `isAccountTag` are skipped.
 * @param options See `WebDiscoverOptions`. When both `maxQueries` and
 *   `braveQueryBudget` are given, the smaller one is the effective cap.
 * @returns De-duplicated posts, sorted by descending score.
 */
export async function discoverPostsViaWebSearch(
  tags: string[],
  options?: WebDiscoverOptions
): Promise<WebDiscoveredPost[]> {
  const perQueryLimit = options?.perQueryLimit ?? 10;
  const placementMode = options?.placementMode ?? false;
  const contentBand = options?.contentBand;

  // A concurrency below 1 (0, a fraction, NaN, negative) would yield empty
  // batches, so the loop below would never advance. Clamp it to at least 1.
  const requestedConcurrency = Math.floor(options?.concurrency ?? 2);
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 1;

  // `braveQueryBudget` is documented as "0 disables Brave", so it must not be
  // shadowed by `maxQueries`: when both are supplied the tighter limit wins.
  const explicitCap = options?.maxQueries;
  const remainingBudget = options?.braveQueryBudget;
  const maxQueries =
    explicitCap != null && remainingBudget != null
      ? Math.min(explicitCap, remainingBudget)
      : (explicitCap ?? remainingBudget ?? resolveBraveQueryCap(placementMode));

  const targetPosts =
    options?.targetPosts ?? (placementMode ? PLACEMENT_WEB_SEARCH_TARGET_POSTS : 30);
  const onProgress = options?.onProgress;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  const keywordTags = tags.filter((t) => !isAccountTag(t));
  if (keywordTags.length === 0 && !contentBand) return [];
  if (braveOptions.priority !== "high") {
    await onProgress?.("略過 Brave 網搜(僅 high priority keyword 使用 Brave");
    return [];
  }
  if (maxQueries <= 0) {
    await onProgress?.("已略過 Brave 網搜(額度保護)");
    return [];
  }

  type QueryJob = { tag: string; query: string };
  const jobs: QueryJob[] = [];
  // Identical query strings (repeated tags, or a band phrase equal to a tag)
  // would spend quota twice for the same results, so each query is queued once.
  const queuedQueries = new Set<string>();
  const enqueue = (tag: string, query: string) => {
    if (queuedQueries.has(query)) return;
    queuedQueries.add(query);
    jobs.push({ tag, query });
  };

  for (const tag of keywordTags) {
    const meta = options?.tagMeta?.get(tag);
    for (const query of buildKeywordQueries(tag, placementMode, meta)) {
      enqueue(tag, query);
    }
  }

  if (placementMode && contentBand) {
    const after = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
    // Only the first two questions and the first pillar are considered, and
    // only phrases of 4-16 characters are short enough to be quoted verbatim.
    const bandPhrases = [
      ...contentBand.questions.slice(0, 2),
      ...contentBand.pillars.slice(0, 1),
    ]
      .map((p) => p.trim())
      .filter((p) => p.length >= 4 && p.length <= 16);
    for (const phrase of bandPhrases) {
      enqueue(phrase, `site:threads.com "${phrase}" 求推薦 after:${after}`);
    }
  }

  const seen = new Set<string>();
  const posts: WebDiscoveredPost[] = [];
  let queriesUsed = 0;
  let jobIndex = 0;

  const mergeFound = (found: WebDiscoveredPost[]) => {
    for (const post of found) {
      const key = postKey(post);
      if (seen.has(key)) continue;
      seen.add(key);
      posts.push(post);
    }
  };

  const runJob = async (job: QueryJob): Promise<WebDiscoveredPost[]> => {
    try {
      const { results } = await searchWebThorough(job.query, perQueryLimit, braveOptions);
      const found: WebDiscoveredPost[] = [];
      for (const item of results) {
        const post = parsePostFromUrl(
          item.link,
          item.title,
          item.snippet,
          job.tag,
          "web",
          placementMode
        );
        if (post && passesPlacementWebFilter(post.text, placementMode, contentBand)) {
          found.push(post);
        }
      }
      return found;
    } catch {
      // Best effort: one failed query must not abort the whole scan.
      return [];
    }
  };

  while (
    jobIndex < jobs.length &&
    queriesUsed < maxQueries &&
    posts.length < targetPosts
  ) {
    // Never start more queries than the remaining budget allows; the previous
    // fixed-size batch could overshoot `maxQueries` on the final round.
    const batchSize = Math.max(1, Math.floor(Math.min(concurrency, maxQueries - queriesUsed)));
    const chunk = jobs.slice(jobIndex, jobIndex + batchSize);
    jobIndex += chunk.length;
    queriesUsed += chunk.length;

    const batches = await runWithConcurrency(chunk, runJob, {
      concurrency,
      staggerMs: placementMode ? [600, 1400] : [2000, 5000],
    });
    mergeFound(batches.flat());

    await onProgress?.(
      `網搜 ${queriesUsed}/${Math.min(maxQueries, jobs.length)} 次 · 已找到 ${posts.length}` +
        (posts.length >= targetPosts ? "(達標,停止)" : "")
    );
  }

  if (posts.length === 0 && !isBraveSearchConfigured()) {
    await onProgress?.("未設定 BRAVE_SEARCH_API_KEY請以 Threads API瀏覽器海巡為主");
  }
  return posts.sort((a, b) => b.score - a.score);
}
/**
 * Runs `site:…/@username` web searches for similar accounts, so account-oriented
 * posts can still be collected when a browser cannot be opened.
 *
 * Searches only run when the resolved priority is "high"; otherwise `[]` is
 * returned immediately. Queries for one account run sequentially, a failing
 * query is skipped, and hits are not passed through the placement text filter.
 *
 * @param accounts Similar accounts; entries whose username normalizes to an
 *   empty value, or repeats an earlier account, are skipped.
 * @param options `perAccountLimit` is the per-query result limit (default 10);
 *   `placementMode` adds intent queries and an `after:` recency filter;
 *   `keywordPriority` overrides the mode's default priority.
 * @returns De-duplicated posts tagged as "account", sorted by descending score.
 */
export async function discoverPostsFromSimilarAccounts(
  accounts: SimilarAccount[],
  options?: {
    perAccountLimit?: number;
    placementMode?: boolean;
    keywordPriority?: KeywordPriority;
  }
): Promise<WebDiscoveredPost[]> {
  const placementMode = options?.placementMode ?? false;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  // Without Brave no query is ever issued, so there is nothing to iterate for.
  if (braveOptions.priority !== "high") return [];

  const perAccountLimit = options?.perAccountLimit ?? 10;
  // The recency suffix does not depend on the account, so it is computed once.
  const after = placementMode
    ? ` after:${formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS)}`
    : "";

  const seen = new Set<string>();
  const visitedUsernames = new Set<string>();
  const posts: WebDiscoveredPost[] = [];

  for (const account of accounts) {
    const username = normalizeUsername(account.username);
    // A repeated account would re-issue identical queries and only waste quota.
    if (!username || visitedUsernames.has(username)) continue;
    visitedUsernames.add(username);

    const tag = `@${username}`;
    // account.reason explains why the account was recommended; it is not the
    // post's own text and must not be passed off as a post.
    const profileQueries = [
      `site:threads.com/@${username}${after}`,
      `site:threads.net/@${username}${after}`,
    ];
    const queries = placementMode
      ? [
          `site:threads.com/@${username} 求推薦${after}`,
          `site:threads.com/@${username} 請益${after}`,
          ...profileQueries,
        ]
      : profileQueries;

    for (const query of queries) {
      try {
        const { results } = await searchWebThorough(query, perAccountLimit, braveOptions);
        for (const item of results) {
          const post = parsePostFromUrl(
            item.link,
            item.title,
            item.snippet,
            tag,
            "account",
            placementMode
          );
          if (!post) continue;
          const key = postKey(post);
          if (seen.has(key)) continue;
          seen.add(key);
          posts.push(post);
        }
      } catch {
        // A failure for a single account must not block the whole scan.
      }
    }
  }
  return posts.sort((a, b) => b.score - a.score);
}
/**
 * Chooses which of two records of the same post to keep.
 *
 * The record from the higher-priority channel (per `SOURCE_PRIORITY`, with a
 * missing `scanSource` treated as "keyword") wins. On equal priority the
 * higher score wins, and an exact score tie goes to the incoming record.
 */
function pickPreferredPost(
  existing: WebDiscoveredPost,
  incoming: WebDiscoveredPost
): WebDiscoveredPost {
  const rankOfExisting = SOURCE_PRIORITY[existing.scanSource ?? "keyword"];
  const rankOfIncoming = SOURCE_PRIORITY[incoming.scanSource ?? "keyword"];

  const incomingWins =
    rankOfIncoming > rankOfExisting ||
    (!(rankOfIncoming < rankOfExisting) && incoming.score >= existing.score);

  return incomingWins ? incoming : existing;
}
/**
 * Merges two post lists into one de-duplicated, score-ordered list.
 *
 * Posts are keyed with `postKey`; when a key repeats, `pickPreferredPost`
 * decides which record survives. Neither input array is modified.
 *
 * @param primary Posts processed first.
 * @param supplemental Posts processed after `primary`.
 * @param max Upper bound on the number of posts returned.
 * @returns At most `max` posts, sorted by descending score.
 */
export function mergeScanPosts(
  primary: WebDiscoveredPost[],
  supplemental: WebDiscoveredPost[],
  max: number
): WebDiscoveredPost[] {
  const bestByKey = new Map<string, WebDiscoveredPost>();

  const absorb = (batch: WebDiscoveredPost[]) => {
    for (const candidate of batch) {
      const key = postKey(candidate);
      const current = bestByKey.get(key);
      bestByKey.set(key, current ? pickPreferredPost(current, candidate) : candidate);
    }
  };
  absorb(primary);
  absorb(supplemental);

  const ranked = Array.from(bestByKey.values());
  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, max);
}
/**
 * Public wrapper around `withSource`: returns a copy of `post` stamped with
 * `scanSource` and with its score scaled by that channel's multiplier.
 *
 * The multiplier is applied on every call, so tagging an already-tagged post
 * scales its score again.
 */
export function tagPostSource<T extends RankedPost & { searchTag?: string }>(
  post: T,
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const tagged = withSource(post, scanSource);
  return tagged;
}