// File-viewer header residue (not part of the module): 440 lines · 13 KiB · TypeScript
import "server-only";
|
||
|
||
import { computePlacementScore, computeScore, type RankedPost } from "@/lib/ranking";
|
||
import {
|
||
DEFAULT_WEB_SEARCH_MAX_QUERIES,
|
||
formatGoogleAfterDate,
|
||
PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS,
|
||
PLACEMENT_WEB_SEARCH_MAX_QUERIES,
|
||
PLACEMENT_WEB_SEARCH_TARGET_POSTS,
|
||
} from "@/lib/scan-recency";
|
||
import {
|
||
isBraveSearchConfigured,
|
||
type BraveWebSearchOptions,
|
||
} from "@/lib/services/web-search";
|
||
import type { KeywordPriority } from "@/lib/search";
|
||
import {
|
||
type ContentBandInput,
|
||
isInContentBand,
|
||
} from "@/lib/research-content-band";
|
||
import { hasPlacementIntent, looksLikeCasualChat } from "@/lib/topic-anchor";
|
||
import type { SearchIntent, SearchTagType } from "@/lib/types/research";
|
||
import {
|
||
isAccountTag,
|
||
normalizeThreadsPostUrl,
|
||
normalizeUsername,
|
||
type SimilarAccount,
|
||
} from "@/lib/types/research";
|
||
import { searchWebThorough } from "@/lib/services/web-search";
|
||
import { runWithConcurrency } from "@/lib/utils/concurrency";
|
||
|
||
/** Channel through which a scanned post was discovered. */
export type ScanPostSource =
  | "account"
  | "keyword"
  | "web";

/** A ranked post annotated with the tag that surfaced it and the channel it came from. */
export type WebDiscoveredPost = RankedPost & {
  /** Tag (keyword, phrase, or `@username`) whose query produced this post. */
  searchTag?: string;
  /** Discovery channel; `pickPreferredPost` treats a missing value as "keyword". */
  scanSource?: ScanPostSource;
};

/** Rank used by `pickPreferredPost` when the same post arrives from two sources; higher wins. */
const SOURCE_PRIORITY: Record<ScanPostSource, number> = {
  account: 3,
  keyword: 2,
  web: 1,
};

/** Multiplier `withSource` applies to a post's score according to its source. */
const SOURCE_SCORE_BOOST: Record<ScanPostSource, number> = {
  account: 1.35,
  keyword: 1,
  web: 0.85,
};
|
||
|
||
/**
 * Builds the de-duplication key for a post.
 *
 * Preference order: `permalink`, then `externalId`, then a synthetic
 * `author:text-prefix` key (first 120 characters of the trimmed text).
 *
 * Blank identifiers (empty or whitespace-only strings) are treated as missing.
 * With plain `??` an empty permalink would become the key `""`, and every post
 * lacking a real permalink would be collapsed into a single "duplicate".
 * Non-blank identifiers are returned unchanged.
 *
 * @param post - Minimal post shape; only `text` is required.
 * @returns A string that identifies the post for de-duplication.
 */
function postKey(post: {
  permalink?: string;
  externalId?: string;
  authorName?: string;
  text: string;
}): string {
  for (const identifier of [post.permalink, post.externalId]) {
    if (identifier != null && identifier.trim() !== "") return identifier;
  }
  return `${post.authorName ?? ""}:${post.text.trim().slice(0, 120)}`;
}
|
||
|
||
/**
 * Returns a copy of `post` tagged with `scanSource`, with its score multiplied
 * by the matching entry of `SOURCE_SCORE_BOOST`. The input is not mutated.
 *
 * @param post - Ranked post, optionally carrying the tag that surfaced it.
 * @param scanSource - Channel the post was discovered through.
 * @returns The tagged, re-scored copy.
 */
function withSource(
  post: RankedPost & { searchTag?: string },
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const boostedScore = post.score * SOURCE_SCORE_BOOST[scanSource];
  const tagged: WebDiscoveredPost = { ...post, scanSource };
  tagged.score = boostedScore;
  return tagged;
}
|
||
|
||
/** Optional per-tag hints used when composing web-search queries. */
interface TagSearchMeta {
  /** Intent label; the need-type intents switch the placement query to a recommendation phrase. */
  searchIntent?: SearchIntent;
  /** Tag type; carried alongside the intent (not read by the query builders in this module). */
  searchType?: SearchTagType;
}
|
||
|
||
/**
 * Builds exactly one high-signal query per keyword to keep Brave quota usage low.
 *
 * The query is restricted to `site:threads.net`, quotes the tag, appends an
 * intent phrase, and limits recency with an `after:` date.
 *
 * @param tag - Keyword to search for (inserted verbatim inside double quotes).
 * @param meta - Optional tag hints; need/help/pain-point intents select "求推薦", anything else "請問".
 * @returns A single-element array holding the query string.
 */
function buildPlacementKeywordQueries(tag: string, meta?: TagSearchMeta): string[] {
  let intentPhrase: string;
  switch (meta?.searchIntent) {
    case "需求":
    case "求助":
    case "痛點":
      intentPhrase = "求推薦";
      break;
    default:
      intentPhrase = "請問";
  }

  const sinceDate = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
  return [`site:threads.net "${tag}" ${intentPhrase} after:${sinceDate}`];
}
|
||
|
||
/**
 * Resolves the maximum number of Brave queries for one scan.
 *
 * A positive integer in the `SCAN_BRAVE_MAX_QUERIES` environment variable
 * overrides the default, but is clamped to 20 in placement mode and 30
 * otherwise. Without a usable override the mode's default constant is returned.
 *
 * @param placementMode - Whether the scan runs in placement mode.
 * @returns The query cap to apply.
 */
function resolveBraveQueryCap(placementMode: boolean): number {
  const fallback = placementMode
    ? PLACEMENT_WEB_SEARCH_MAX_QUERIES
    : DEFAULT_WEB_SEARCH_MAX_QUERIES;

  const envValue = process.env.SCAN_BRAVE_MAX_QUERIES?.trim();
  if (!envValue) return fallback;

  const override = Number.parseInt(envValue, 10);
  if (!Number.isFinite(override) || override <= 0) return fallback;

  const ceiling = placementMode ? 20 : 30;
  return Math.min(override, ceiling);
}
|
||
|
||
/**
 * Builds the web-search queries for one keyword tag.
 *
 * Placement mode delegates to `buildPlacementKeywordQueries`; otherwise a
 * single plain `site:threads.net "<tag>"` query is produced.
 *
 * @param tag - Keyword to search for.
 * @param placementMode - Whether to build placement-oriented queries.
 * @param meta - Optional tag hints, forwarded in placement mode only.
 * @returns The queries to run for this tag.
 */
function buildKeywordQueries(
  tag: string,
  placementMode: boolean,
  meta?: TagSearchMeta
): string[] {
  return placementMode
    ? buildPlacementKeywordQueries(tag, meta)
    : [`site:threads.net "${tag}"`];
}
|
||
|
||
/**
 * Decides whether a web-discovered post's text should be kept.
 *
 * Outside placement mode everything passes. In placement mode the text is
 * rejected when it looks like casual chat, shows no placement intent, or —
 * when a content band is supplied — falls outside that band. The checks run
 * in that order and stop at the first rejection.
 *
 * @param text - Post text assembled from the search result.
 * @param placementMode - Whether placement filtering applies.
 * @param contentBand - Optional content band used as an extra constraint.
 * @returns `true` when the post should be kept.
 */
function passesPlacementWebFilter(
  text: string,
  placementMode: boolean,
  contentBand?: ContentBandInput
): boolean {
  if (!placementMode) return true;

  const rejected =
    looksLikeCasualChat(text) ||
    !hasPlacementIntent(text) ||
    (contentBand ? !isInContentBand(text, contentBand) : false);

  return !rejected;
}
|
||
|
||
/**
 * Scores a discovered post with the scorer that matches the scan mode:
 * `computePlacementScore` in placement mode, `computeScore` otherwise.
 *
 * @param raw - Post fields handed to the scorer unchanged.
 * @param placementMode - Selects which scorer is used.
 * @returns The score produced by the selected scorer.
 */
function scoreDiscoveredPost(
  raw: {
    text: string;
    permalink: string;
    authorName: string;
    externalId: string;
    postedAt?: Date;
    likeCount?: number;
    replyCount?: number;
  },
  placementMode: boolean
): number {
  if (placementMode) {
    return computePlacementScore(raw);
  }
  return computeScore(raw);
}
|
||
|
||
/**
 * Converts one web-search result into a discovered post.
 *
 * Returns `null` when the link does not normalize to a Threads post URL, when
 * the normalized URL does not match `threads.com/@<user>/post/<id>`, or when
 * the combined title/snippet text is shorter than 8 characters.
 *
 * @param link - Result URL.
 * @param title - Result title.
 * @param snippet - Result snippet.
 * @param searchTag - Tag whose query produced the result.
 * @param scanSource - Discovery channel recorded on the post (default "web").
 * @param placementMode - Selects the scoring function (default `false`).
 * @returns The scored, source-tagged post, or `null` if the result is unusable.
 */
function parsePostFromUrl(
  link: string,
  title: string,
  snippet: string,
  searchTag: string,
  scanSource: ScanPostSource = "web",
  placementMode = false
): WebDiscoveredPost | null {
  const permalink = normalizeThreadsPostUrl(link);
  if (!permalink) return null;

  const urlParts = /threads\.com\/@([^/]+)\/post\/([^/?#]+)/i.exec(permalink);
  if (!urlParts) return null;

  // Title and snippet are the only text available from a search result;
  // empty pieces are dropped before joining.
  const pieces: string[] = [];
  for (const piece of [title, snippet]) {
    const trimmed = piece.trim();
    if (trimmed) pieces.push(trimmed);
  }
  const text = pieces.join(" — ");
  if (text.length < 8) return null;

  const [, authorName, externalId] = urlParts;
  const raw = { text, permalink, authorName, externalId };
  const score = scoreDiscoveredPost(raw, placementMode);

  return withSource({ ...raw, score, searchTag }, scanSource);
}
|
||
|
||
/**
 * Uses the reference post attached to an account found in the research map
 * directly as a high-quality seed post.
 *
 * Returns `null` when the account has no `postUrl`, when the URL does not
 * normalize, or when it does not match `threads.com/@<user>/post/<id>`.
 * The seed is scored with `computeScore` times 1.2 and tagged as an
 * "account"-sourced post.
 *
 * @param account - Similar account, possibly carrying a reference post URL.
 * @returns The seed post, or `null` when no usable reference post exists.
 */
export function postFromSimilarAccountSeed(account: SimilarAccount): WebDiscoveredPost | null {
  const permalink = account.postUrl ? normalizeThreadsPostUrl(account.postUrl) : null;
  if (!permalink) return null;

  const urlParts = /threads\.com\/@([^/]+)\/post\/([^/?#]+)/i.exec(permalink);
  if (!urlParts) return null;
  const [, handleFromUrl, externalId] = urlParts;

  // Prefer the account's own username; fall back to the handle in the URL.
  const authorName = normalizeUsername(account.username) || handleFromUrl;

  // NOTE(review): `account.reason` is used as the post text here, while a comment in
  // discoverPostsFromSimilarAccounts says the reason must not be shown as post text — confirm intent.
  const text = (account.reason || `相似帳號 @${authorName} 的參考貼文`).slice(0, 280);

  const seed = { text, permalink, authorName, externalId };
  const score = computeScore(seed) * 1.2;

  return withSource({ ...seed, score, searchTag: `@${authorName}` }, "account");
}
|
||
|
||
/** Options accepted by `discoverPostsViaWebSearch`. */
export interface WebDiscoverOptions {
  /** Result limit passed to each search call (defaults to 15). */
  perQueryLimit?: number;

  /** Placement mode: prioritize help-seeking / recommendation-seeking posts and use `after:` to keep results recent. */
  placementMode?: boolean;

  /** Number of queries run in parallel (defaults to 2 in both placement and regular mode). */
  concurrency?: number;

  /** Per-tag searchIntent / searchType, used to compose more precise web-search queries. */
  tagMeta?: Map<string, TagSearchMeta>;

  /** Placement mode: audience questions / content pillars / exclusions from the research map. */
  contentBand?: ContentBandInput;

  /** Maximum number of Brave web-search queries; searching stops once it is reached or exceeded. */
  maxQueries?: number;

  /** Stop searching once this many posts have been found. */
  targetPosts?: number;

  /** Used as the query cap when `maxQueries` is not given; 0 means Brave web search is not run. */
  braveQueryBudget?: number;

  /** MVP: only "high" priority keywords use Brave. */
  keywordPriority?: KeywordPriority;

  /** Progress callback; receives a human-readable status message and may be async. */
  onProgress?: (message: string) => void | Promise<void>;
}
|
||
|
||
/**
 * Builds the Brave search options shared by the discovery functions.
 *
 * `patrolMode` and `threadsOnly` are always enabled. The priority is the
 * explicit `keywordPriority` when given, otherwise "high" in placement mode
 * and "medium" in regular mode.
 *
 * @param placementMode - Whether the scan runs in placement mode.
 * @param keywordPriority - Optional explicit priority that overrides the mode default.
 * @returns Options to pass to the web-search service.
 */
function resolveBraveSearchOptions(
  placementMode: boolean,
  keywordPriority?: KeywordPriority
): BraveWebSearchOptions {
  const modeDefault = placementMode ? "high" : "medium";
  return {
    patrolMode: true,
    priority: keywordPriority ?? modeDefault,
    threadsOnly: true,
  };
}
|
||
|
||
/**
 * Finds Threads post links through Brave Search (a supplement for when the
 * Threads API / crawler does not return enough).
 *
 * Flow:
 * 1. Account tags are dropped; each remaining keyword tag contributes its
 *    queries. In placement mode with a content band, up to two questions and
 *    one pillar (4–16 characters after trimming) add extra queries.
 * 2. Queries run in chunks of at most `concurrency`, stopping when the query
 *    cap is reached, the target post count is met, or no jobs remain.
 * 3. Results are parsed, filtered (placement mode only), de-duplicated by
 *    `postKey`, and returned sorted by descending score.
 *
 * Returns an empty array without searching when there is nothing to search
 * for, when the resolved priority is not "high", or when the query cap is <= 0.
 *
 * Guarantees relied on by the loop:
 * - a chunk never exceeds the remaining query budget, so the cap is a true maximum;
 * - `concurrency` is clamped to an integer >= 1, so the loop always advances;
 * - identical query strings are queued once, so duplicate tags do not burn quota.
 *
 * @param tags - Search tags; entries recognised by `isAccountTag` are ignored here.
 * @param options - Limits, mode flags, tag metadata, and a progress callback.
 * @returns Discovered posts, highest score first.
 */
export async function discoverPostsViaWebSearch(
  tags: string[],
  options?: WebDiscoverOptions
): Promise<WebDiscoveredPost[]> {
  const perQueryLimit = options?.perQueryLimit ?? 15;
  const placementMode = options?.placementMode ?? false;
  const contentBand = options?.contentBand;

  // A non-positive or non-finite concurrency would produce empty chunks forever.
  const requestedConcurrency = options?.concurrency ?? 2;
  const concurrency =
    Number.isFinite(requestedConcurrency) && requestedConcurrency >= 1
      ? Math.floor(requestedConcurrency)
      : 1;

  // Floor so that the remaining budget is always a whole number of queries.
  const maxQueries = Math.floor(
    options?.maxQueries ?? options?.braveQueryBudget ?? resolveBraveQueryCap(placementMode)
  );
  const targetPosts =
    options?.targetPosts ?? (placementMode ? PLACEMENT_WEB_SEARCH_TARGET_POSTS : 30);
  const onProgress = options?.onProgress;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  const keywordTags = tags.filter((t) => !isAccountTag(t));
  if (keywordTags.length === 0 && !contentBand) return [];

  if (braveOptions.priority !== "high") {
    await onProgress?.("略過 Brave 網搜(僅 high priority keyword 使用 Brave)");
    return [];
  }

  if (maxQueries <= 0) {
    await onProgress?.("已略過 Brave 網搜(額度保護)");
    return [];
  }

  type QueryJob = { tag: string; query: string };
  const jobs: QueryJob[] = [];
  const queuedQueries = new Set<string>();
  const enqueue = (tag: string, query: string): void => {
    // The same query string would return the same results; run it only once.
    if (queuedQueries.has(query)) return;
    queuedQueries.add(query);
    jobs.push({ tag, query });
  };

  for (const tag of keywordTags) {
    const meta = options?.tagMeta?.get(tag);
    for (const query of buildKeywordQueries(tag, placementMode, meta)) {
      enqueue(tag, query);
    }
  }

  if (placementMode && contentBand) {
    const after = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
    const bandPhrases = [
      ...contentBand.questions.slice(0, 2),
      ...contentBand.pillars.slice(0, 1),
    ]
      .map((p) => p.trim())
      .filter((p) => p.length >= 4 && p.length <= 16);
    for (const phrase of bandPhrases) {
      enqueue(phrase, `site:threads.net "${phrase}" 求推薦 after:${after}`);
    }
  }

  const seen = new Set<string>();
  const posts: WebDiscoveredPost[] = [];
  let queriesUsed = 0;
  let jobIndex = 0;

  const mergeFound = (found: WebDiscoveredPost[]): void => {
    for (const post of found) {
      const key = postKey(post);
      if (seen.has(key)) continue;
      seen.add(key);
      posts.push(post);
    }
  };

  const runJob = async (job: QueryJob): Promise<WebDiscoveredPost[]> => {
    try {
      const { results } = await searchWebThorough(job.query, perQueryLimit, braveOptions);
      const found: WebDiscoveredPost[] = [];
      for (const item of results) {
        const post = parsePostFromUrl(
          item.link,
          item.title,
          item.snippet,
          job.tag,
          "web",
          placementMode
        );
        if (post && passesPlacementWebFilter(post.text, placementMode, contentBand)) {
          found.push(post);
        }
      }
      return found;
    } catch {
      // Best effort: a failed query contributes no posts but must not abort the scan.
      return [];
    }
  };

  while (
    jobIndex < jobs.length &&
    queriesUsed < maxQueries &&
    posts.length < targetPosts
  ) {
    // Never take more jobs than the remaining budget allows.
    const chunkSize = Math.min(concurrency, maxQueries - queriesUsed);
    const chunk = jobs.slice(jobIndex, jobIndex + chunkSize);
    jobIndex += chunk.length;
    queriesUsed += chunk.length;

    const batches = await runWithConcurrency(chunk, runJob, {
      concurrency,
      staggerMs: placementMode ? [600, 1400] : [2000, 5000],
    });
    mergeFound(batches.flat());

    await onProgress?.(
      `網搜 ${queriesUsed}/${Math.min(maxQueries, jobs.length)} 次 · 已找到 ${posts.length} 篇` +
        (posts.length >= targetPosts ? "(達標,停止)" : "")
    );
  }

  if (posts.length === 0 && !isBraveSearchConfigured()) {
    await onProgress?.("未設定 BRAVE_SEARCH_API_KEY,請以 Threads API/瀏覽器海巡為主");
  }

  return posts.sort((a, b) => b.score - a.score);
}
|
||
|
||
/**
 * Runs `site:threads.net/@username` web searches for similar accounts, so that
 * account-oriented posts can still be collected when a browser cannot be opened.
 *
 * Brave is only used when the resolved priority is "high"; otherwise an empty
 * array is returned immediately, before any per-account work. Each distinct
 * username is searched once. Placement mode issues three queries per account
 * (two intent phrases plus a plain one), each limited by an `after:` date;
 * regular mode issues a single plain query. Queries run sequentially.
 *
 * @param accounts - Similar accounts to search for.
 * @param options - `perAccountLimit` (default 15), `placementMode` (default `false`), `keywordPriority`.
 * @returns De-duplicated posts tagged as "account"-sourced, highest score first.
 */
export async function discoverPostsFromSimilarAccounts(
  accounts: SimilarAccount[],
  options?: {
    perAccountLimit?: number;
    placementMode?: boolean;
    keywordPriority?: KeywordPriority;
  }
): Promise<WebDiscoveredPost[]> {
  const placementMode = options?.placementMode ?? false;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  // The priority gate does not depend on the account, so decide once up front.
  if (braveOptions.priority !== "high") return [];

  const perAccountLimit = options?.perAccountLimit ?? 15;
  const after = placementMode
    ? ` after:${formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS)}`
    : "";

  const searchedUsernames = new Set<string>();
  const seen = new Set<string>();
  const posts: WebDiscoveredPost[] = [];

  for (const account of accounts) {
    const username = normalizeUsername(account.username);
    if (!username) continue;

    // A repeated account would re-issue identical queries and only spend quota.
    if (searchedUsernames.has(username)) continue;
    searchedUsernames.add(username);

    const tag = `@${username}`;

    // account.reason explains why the account was recommended; it is not the
    // post's own text and must not be presented as if it were.
    const queries = placementMode
      ? [
          `site:threads.net/@${username} 求推薦${after}`,
          `site:threads.net/@${username} 請益${after}`,
          `site:threads.net/@${username}${after}`,
        ]
      : [`site:threads.net/@${username}`];

    for (const query of queries) {
      try {
        const { results } = await searchWebThorough(query, perAccountLimit, braveOptions);
        for (const item of results) {
          const post = parsePostFromUrl(
            item.link,
            item.title,
            item.snippet,
            tag,
            "account",
            placementMode
          );
          if (!post) continue;
          const key = postKey(post);
          if (seen.has(key)) continue;
          seen.add(key);
          posts.push(post);
        }
      } catch {
        // A failure for a single account must not block the whole patrol.
      }
    }
  }

  return posts.sort((a, b) => b.score - a.score);
}
|
||
|
||
/**
 * Chooses which of two records for the same post to keep.
 *
 * The record from the higher-priority source (per `SOURCE_PRIORITY`) wins; a
 * missing `scanSource` counts as "keyword". On equal priority the higher score
 * wins, and an exact tie goes to `incoming`.
 *
 * @param existing - Record already stored for this post.
 * @param incoming - Newly seen record for the same post.
 * @returns The record to keep.
 */
function pickPreferredPost(
  existing: WebDiscoveredPost,
  incoming: WebDiscoveredPost
): WebDiscoveredPost {
  const rankOf = (post: WebDiscoveredPost): number =>
    SOURCE_PRIORITY[post.scanSource ?? "keyword"];

  const priorityGap = rankOf(incoming) - rankOf(existing);
  if (priorityGap !== 0) {
    return priorityGap > 0 ? incoming : existing;
  }
  return incoming.score >= existing.score ? incoming : existing;
}
|
||
|
||
/**
 * Merges two post lists into one de-duplicated, score-ordered list.
 *
 * Posts are keyed with `postKey`; when two records share a key,
 * `pickPreferredPost` decides which survives. The result is sorted by
 * descending score and truncated to `max` items.
 *
 * A negative `max` is treated as 0. Passed straight to `slice`, a negative end
 * index would instead drop items from the tail and return nearly everything.
 *
 * @param primary - Posts processed first.
 * @param supplemental - Posts processed after `primary`.
 * @param max - Maximum number of posts to return.
 * @returns At most `max` posts, highest score first.
 */
export function mergeScanPosts(
  primary: WebDiscoveredPost[],
  supplemental: WebDiscoveredPost[],
  max: number
): WebDiscoveredPost[] {
  const byKey = new Map<string, WebDiscoveredPost>();

  for (const post of [...primary, ...supplemental]) {
    const key = postKey(post);
    const existing = byKey.get(key);
    byKey.set(key, existing ? pickPreferredPost(existing, post) : post);
  }

  const limit = Math.max(0, max);
  return [...byKey.values()].sort((a, b) => b.score - a.score).slice(0, limit);
}
|
||
|
||
/**
 * Public wrapper around `withSource`: returns a copy of `post` tagged with
 * `scanSource` and re-scored with that source's boost.
 *
 * @param post - Ranked post, optionally carrying its search tag.
 * @param scanSource - Channel the post was discovered through.
 * @returns The tagged, re-scored copy.
 */
export function tagPostSource<T extends RankedPost & { searchTag?: string }>(
  post: T,
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const tagged = withSource(post, scanSource);
  return tagged;
}
|