441 lines
13 KiB
TypeScript
441 lines
13 KiB
TypeScript
|
|
import "server-only";
|
|||
|
|
|
|||
|
|
import { computePlacementScore, computeScore, type RankedPost } from "@/lib/ranking";
|
|||
|
|
import {
|
|||
|
|
DEFAULT_WEB_SEARCH_MAX_QUERIES,
|
|||
|
|
formatGoogleAfterDate,
|
|||
|
|
PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS,
|
|||
|
|
PLACEMENT_WEB_SEARCH_MAX_QUERIES,
|
|||
|
|
PLACEMENT_WEB_SEARCH_TARGET_POSTS,
|
|||
|
|
} from "@/lib/scan-recency";
|
|||
|
|
import {
|
|||
|
|
isBraveSearchConfigured,
|
|||
|
|
type BraveWebSearchOptions,
|
|||
|
|
} from "@/lib/services/web-search";
|
|||
|
|
import type { KeywordPriority } from "@/lib/search";
|
|||
|
|
import {
|
|||
|
|
type ContentBandInput,
|
|||
|
|
isInContentBand,
|
|||
|
|
} from "@/lib/research-content-band";
|
|||
|
|
import { hasPlacementIntent, looksLikeCasualChat } from "@/lib/topic-anchor";
|
|||
|
|
import type { SearchIntent, SearchTagType } from "@/lib/types/research";
|
|||
|
|
import {
|
|||
|
|
isAccountTag,
|
|||
|
|
normalizeThreadsPostUrl,
|
|||
|
|
normalizeUsername,
|
|||
|
|
type SimilarAccount,
|
|||
|
|
} from "@/lib/types/research";
|
|||
|
|
import { searchWebThorough } from "@/lib/services/web-search";
|
|||
|
|
import { runWithConcurrency } from "@/lib/utils/concurrency";
|
|||
|
|
|
|||
|
|
/** Label recording how a scanned post was discovered; used to look up per-source priority and score boost. */
export type ScanPostSource = "account" | "keyword" | "web";
|
|||
|
|
|
|||
|
|
/**
 * A ranked post plus optional discovery metadata attached by this module.
 */
export type WebDiscoveredPost = RankedPost & {
  /** How the post was discovered (see `ScanPostSource`). */
  scanSource?: ScanPostSource;
  /** The tag, phrase, or `@username` whose query surfaced the post. */
  searchTag?: string;
};
|
|||
|
|
|
|||
|
|
/**
 * Precedence of each discovery source when the same post is found more than once.
 * A higher number wins: account > keyword > web.
 */
const SOURCE_PRIORITY: Record<ScanPostSource, number> = { account: 3, keyword: 2, web: 1 };
|
|||
|
|
|
|||
|
|
/**
 * Multiplier applied to a post's score according to its discovery source:
 * account posts are boosted, keyword posts are unchanged, web posts are discounted.
 */
const SOURCE_SCORE_BOOST: Record<ScanPostSource, number> = { account: 1.35, keyword: 1, web: 0.85 };
|
|||
|
|
|
|||
|
|
/**
 * Builds the deduplication key for a post.
 *
 * Candidates are tried in order: permalink, external id, then a synthetic
 * `author:first-120-chars-of-trimmed-text` key.
 *
 * A candidate that is missing OR blank (empty / whitespace-only) is skipped.
 * Using `??` alone would accept an empty-string permalink as the key, making
 * every post with an empty permalink collide on `""` and be dropped as a
 * duplicate.
 *
 * @param post Minimal post shape; only `text` is required.
 * @returns A string that identifies the post for dedupe/merge purposes.
 */
function postKey(post: {
  permalink?: string;
  externalId?: string;
  authorName?: string;
  text: string;
}): string {
  // Non-blank identifiers are returned unmodified so existing keys stay stable.
  if (post.permalink != null && post.permalink.trim() !== "") return post.permalink;
  if (post.externalId != null && post.externalId.trim() !== "") return post.externalId;
  return `${post.authorName ?? ""}:${post.text.trim().slice(0, 120)}`;
}
|
|||
|
|
|
|||
|
|
/**
 * Returns a copy of `post` tagged with its discovery source, with the score
 * scaled by that source's entry in `SOURCE_SCORE_BOOST`.
 *
 * The input post is not mutated.
 *
 * @param post Ranked post, optionally carrying a `searchTag`.
 * @param scanSource Discovery source to record on the copy.
 * @returns The tagged, re-scored copy.
 */
function withSource(
  post: RankedPost & { searchTag?: string },
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const boostedScore = post.score * SOURCE_SCORE_BOOST[scanSource];
  return { ...post, scanSource, score: boostedScore };
}
|
|||
|
|
|
|||
|
|
/** Optional per-tag metadata consulted when building web-search queries. */
interface TagSearchMeta {
  /** Type of the search tag. Not read by the query builders in this file. */
  searchType?: SearchTagType;
  /** Intent of the tag; selects the intent word appended to placement queries. */
  searchIntent?: SearchIntent;
}
|
|||
|
|
|
|||
|
|
/**
 * Builds exactly one high-signal query per keyword, to keep Brave quota usage low.
 *
 * The query is restricted to threads.com, quotes the tag, appends an intent
 * word, and adds an `after:` date derived from
 * `PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS`.
 *
 * @param tag Keyword to search for (inserted between double quotes as-is).
 * @param meta Optional tag metadata; a need / help / pain-point intent switches
 *   the appended intent word.
 * @returns A single-element array holding the query string.
 */
function buildPlacementKeywordQueries(tag: string, meta?: TagSearchMeta): string[] {
  let intent = "請問";
  switch (meta?.searchIntent) {
    case "需求":
    case "求助":
    case "痛點":
      intent = "求推薦";
      break;
    default:
      break;
  }

  const after = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
  return [`site:threads.com "${tag}" ${intent} after:${after}`];
}
|
|||
|
|
|
|||
|
|
/**
 * Resolves the maximum number of Brave queries for one scan.
 *
 * If the `SCAN_BRAVE_MAX_QUERIES` environment variable parses to a positive
 * integer, that value is used, capped at 20 in placement mode or 30 otherwise.
 * Otherwise the mode's default constant is returned.
 *
 * @param placementMode Whether the scan runs in placement mode.
 * @returns The query cap to apply.
 */
function resolveBraveQueryCap(placementMode: boolean): number {
  // A missing or blank variable parses to NaN and falls through to the defaults.
  const override = Number.parseInt(process.env.SCAN_BRAVE_MAX_QUERIES?.trim() ?? "", 10);

  if (Number.isFinite(override) && override > 0) {
    const ceiling = placementMode ? 20 : 30;
    return Math.min(override, ceiling);
  }

  if (placementMode) return PLACEMENT_WEB_SEARCH_MAX_QUERIES;
  return DEFAULT_WEB_SEARCH_MAX_QUERIES;
}
|
|||
|
|
|
|||
|
|
/**
 * Builds the web-search queries for one keyword tag.
 *
 * Placement mode delegates to `buildPlacementKeywordQueries`. Otherwise the
 * quoted tag is searched on both threads.com and threads.net.
 *
 * @param tag Keyword to search for.
 * @param placementMode Whether to build placement-style queries.
 * @param meta Optional tag metadata, forwarded in placement mode only.
 * @returns The query strings to run for this tag.
 */
function buildKeywordQueries(
  tag: string,
  placementMode: boolean,
  meta?: TagSearchMeta
): string[] {
  return placementMode
    ? buildPlacementKeywordQueries(tag, meta)
    : [`site:threads.com "${tag}"`, `site:threads.net "${tag}"`];
}
|
|||
|
|
|
|||
|
|
/**
 * Decides whether a discovered post's text should be kept.
 *
 * Outside placement mode everything passes. In placement mode the text is
 * rejected if it looks like casual chat, lacks placement intent, or (when a
 * content band is supplied) falls outside that band. Checks run in that order
 * and stop at the first rejection.
 *
 * @param text Post text to evaluate.
 * @param placementMode Whether placement filtering applies.
 * @param contentBand Optional content band to test the text against.
 * @returns `true` when the post should be kept.
 */
function passesPlacementWebFilter(
  text: string,
  placementMode: boolean,
  contentBand?: ContentBandInput
): boolean {
  if (!placementMode) return true;

  const rejected =
    looksLikeCasualChat(text) ||
    !hasPlacementIntent(text) ||
    (contentBand ? !isInContentBand(text, contentBand) : false);

  return !rejected;
}
|
|||
|
|
|
|||
|
|
/**
 * Scores a discovered post with the scorer that matches the scan mode.
 *
 * @param raw Post fields passed unchanged to the scorer.
 * @param placementMode `true` uses `computePlacementScore`; `false` uses `computeScore`.
 * @returns The score produced by the selected scorer.
 */
function scoreDiscoveredPost(
  raw: {
    text: string;
    permalink: string;
    authorName: string;
    externalId: string;
    postedAt?: Date;
    likeCount?: number;
    replyCount?: number;
  },
  placementMode: boolean
): number {
  if (placementMode) {
    return computePlacementScore(raw);
  }
  return computeScore(raw);
}
|
|||
|
|
|
|||
|
|
/**
 * Converts one web-search result into a discovered post, or `null` if the
 * result cannot be used.
 *
 * A result is rejected when the link does not normalize to a Threads post URL,
 * when the normalized URL does not match `threads.com/@<author>/post/<id>`, or
 * when the combined title/snippet text is shorter than 8 characters.
 *
 * @param link Result URL.
 * @param title Result title; trimmed and used as the first part of the text.
 * @param snippet Result snippet; trimmed and appended after an em-dash separator.
 * @param searchTag Tag recorded on the resulting post.
 * @param scanSource Discovery source recorded on the post (default `"web"`).
 * @param placementMode Selects the scoring function (default `false`).
 * @returns The scored, source-tagged post, or `null`.
 */
function parsePostFromUrl(
  link: string,
  title: string,
  snippet: string,
  searchTag: string,
  scanSource: ScanPostSource = "web",
  placementMode = false
): WebDiscoveredPost | null {
  const permalink = normalizeThreadsPostUrl(link);
  if (!permalink) return null;

  const urlParts = /threads\.com\/@([^/]+)\/post\/([^/?#]+)/i.exec(permalink);
  if (urlParts === null) return null;

  // The text is title and snippet, each trimmed, with empty parts dropped.
  const pieces: string[] = [];
  for (const part of [title, snippet]) {
    const trimmed = part.trim();
    if (trimmed) pieces.push(trimmed);
  }
  const text = pieces.join(" — ");
  if (text.length < 8) return null;

  const [, authorName, externalId] = urlParts;
  const raw = { text, permalink, authorName, externalId };
  const score = scoreDiscoveredPost(raw, placementMode);

  return withSource({ ...raw, score, searchTag }, scanSource);
}
|
|||
|
|
|
|||
|
|
/**
 * Turns the reference post attached to a similar account (found in the
 * research map) into a high-quality seed post.
 *
 * Returns `null` when the account has no `postUrl`, when the URL does not
 * normalize, or when it does not match `threads.com/@<author>/post/<id>`.
 * The score is `computeScore` times 1.2, and `withSource` then applies the
 * `"account"` source boost on top of that.
 *
 * NOTE(review): `text` is taken from `account.reason`, with a generated
 * placeholder as fallback. A comment elsewhere in this file states that
 * `reason` explains why the account was recommended and is not the post's
 * actual text — confirm that callers do not display `text` as post content.
 *
 * @param account Similar account, possibly carrying a reference post URL.
 * @returns The seed post tagged as an `"account"` source, or `null`.
 */
export function postFromSimilarAccountSeed(account: SimilarAccount): WebDiscoveredPost | null {
  const permalink = account.postUrl ? normalizeThreadsPostUrl(account.postUrl) : null;
  if (!permalink) return null;

  const urlParts = /threads\.com\/@([^/]+)\/post\/([^/?#]+)/i.exec(permalink);
  if (!urlParts) return null;
  const [, handleFromUrl, externalId] = urlParts;

  // Prefer the account's own username; fall back to the handle in the URL.
  const username = normalizeUsername(account.username) || handleFromUrl;
  const fallbackText = `相似帳號 @${username} 的參考貼文`;
  const text = (account.reason || fallbackText).slice(0, 280);

  const raw = { text, permalink, authorName: username, externalId };
  const seedScore = computeScore(raw) * 1.2;

  return withSource({ ...raw, score: seedScore, searchTag: `@${username}` }, "account");
}
|
|||
|
|
|
|||
|
|
/** Options accepted by `discoverPostsViaWebSearch`. */
export interface WebDiscoverOptions {
  /** Maximum results requested per query (defaults to 10). */
  perQueryLimit?: number;

  /**
   * Placement mode: prioritize help-seeking / recommendation-seeking posts and
   * restrict results to recent ones with an `after:` filter.
   */
  placementMode?: boolean;

  /** Number of queries run in parallel (defaults to 2 in both placement and normal mode). */
  concurrency?: number;

  /** Per-tag `searchIntent` / `searchType`, used to build more precise web-search queries. */
  tagMeta?: Map<string, TagSearchMeta>;

  /** Placement mode: audience questions / content pillars / exclusions from the research map. */
  contentBand?: ContentBandInput;

  /**
   * Maximum number of Brave web-search queries; the search stops once the
   * target is met or the budget is used up. Takes precedence over
   * `braveQueryBudget`.
   */
  maxQueries?: number;

  /** Stop once this many posts have been found. */
  targetPosts?: number;

  /** Query budget used when `maxQueries` is not set; 0 means Brave web search is skipped. */
  braveQueryBudget?: number;

  /** MVP: only `high` priority keywords use Brave. */
  keywordPriority?: KeywordPriority;

  /** Receives human-readable progress messages; may be async. */
  onProgress?: (message: string) => void | Promise<void>;
}
|
|||
|
|
|
|||
|
|
/**
 * Builds the option object passed to the Brave search service.
 *
 * An explicit keyword priority wins. Otherwise placement mode resolves to
 * `"high"` and normal mode to `"medium"`. `patrolMode` and `threadsOnly` are
 * always enabled.
 *
 * @param placementMode Whether the scan runs in placement mode.
 * @param keywordPriority Optional explicit priority.
 * @returns The Brave search options.
 */
function resolveBraveSearchOptions(
  placementMode: boolean,
  keywordPriority?: KeywordPriority
): BraveWebSearchOptions {
  let priority = keywordPriority;
  if (priority == null) {
    priority = placementMode ? "high" : "medium";
  }
  return { patrolMode: true, priority, threadsOnly: true };
}
|
|||
|
|
|
|||
|
|
/**
 * Uses Brave Search to find Threads post links, as a supplement for when the
 * Threads API / crawler does not return enough.
 *
 * Account tags are ignored here; only keyword tags produce queries. In
 * placement mode with a content band, up to three band phrases (two questions
 * and one pillar, each 4–16 characters) are added as extra queries.
 *
 * Returns an empty array without searching when:
 * - there are no keyword tags and no content band,
 * - the resolved Brave priority is not `"high"`, or
 * - the query budget is zero or negative.
 *
 * Queries run in chunks of at most `concurrency` until the jobs run out, the
 * query budget is spent, or `targetPosts` posts have been collected. A query
 * that throws contributes no posts and does not abort the scan.
 *
 * @param tags Search tags; account tags are filtered out.
 * @param options Limits, mode flags, and an optional progress callback.
 * @returns Deduplicated posts sorted by descending score.
 */
export async function discoverPostsViaWebSearch(
  tags: string[],
  options?: WebDiscoverOptions
): Promise<WebDiscoveredPost[]> {
  const perQueryLimit = options?.perQueryLimit ?? 10;
  const placementMode = options?.placementMode ?? false;
  const contentBand = options?.contentBand;

  // Clamp to a whole number >= 1. A zero, negative, or NaN concurrency would
  // yield an empty chunk that never advances the job index, so the loop below
  // would never terminate.
  const requestedConcurrency = options?.concurrency ?? 2;
  const concurrency = Number.isFinite(requestedConcurrency)
    ? Math.max(1, Math.floor(requestedConcurrency))
    : 2;

  const maxQueries =
    options?.maxQueries ?? options?.braveQueryBudget ?? resolveBraveQueryCap(placementMode);
  const targetPosts =
    options?.targetPosts ?? (placementMode ? PLACEMENT_WEB_SEARCH_TARGET_POSTS : 30);
  const onProgress = options?.onProgress;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  const keywordTags = tags.filter((t) => !isAccountTag(t));
  if (keywordTags.length === 0 && !contentBand) return [];

  if (braveOptions.priority !== "high") {
    await onProgress?.("略過 Brave 網搜(僅 high priority keyword 使用 Brave)");
    return [];
  }

  if (maxQueries <= 0) {
    await onProgress?.("已略過 Brave 網搜(額度保護)");
    return [];
  }

  type QueryJob = { tag: string; query: string };
  const jobs: QueryJob[] = [];
  // Identical query strings (repeated tags, or a band phrase that equals a tag)
  // are queued once so they do not consume the Brave budget twice.
  const queuedQueries = new Set<string>();
  const enqueue = (tag: string, query: string): void => {
    if (queuedQueries.has(query)) return;
    queuedQueries.add(query);
    jobs.push({ tag, query });
  };

  for (const tag of keywordTags) {
    const meta = options?.tagMeta?.get(tag);
    for (const query of buildKeywordQueries(tag, placementMode, meta)) {
      enqueue(tag, query);
    }
  }

  if (placementMode && contentBand) {
    const after = formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS);
    const bandPhrases = [
      ...contentBand.questions.slice(0, 2),
      ...contentBand.pillars.slice(0, 1),
    ]
      .map((p) => p.trim())
      .filter((p) => p.length >= 4 && p.length <= 16);
    for (const phrase of bandPhrases) {
      enqueue(phrase, `site:threads.com "${phrase}" 求推薦 after:${after}`);
    }
  }

  const seen = new Set<string>();
  const posts: WebDiscoveredPost[] = [];
  let queriesUsed = 0;
  let jobIndex = 0;

  // Appends posts not seen before, keyed by `postKey`.
  const mergeFound = (found: WebDiscoveredPost[]): void => {
    for (const post of found) {
      const key = postKey(post);
      if (seen.has(key)) continue;
      seen.add(key);
      posts.push(post);
    }
  };

  // Runs one query and returns the results that parse and pass the placement filter.
  const runJob = async (job: QueryJob): Promise<WebDiscoveredPost[]> => {
    try {
      const { results } = await searchWebThorough(job.query, perQueryLimit, braveOptions);
      const found: WebDiscoveredPost[] = [];
      for (const item of results) {
        const post = parsePostFromUrl(
          item.link,
          item.title,
          item.snippet,
          job.tag,
          "web",
          placementMode
        );
        if (post && passesPlacementWebFilter(post.text, placementMode, contentBand)) {
          found.push(post);
        }
      }
      return found;
    } catch {
      // Best effort: one failed query must not abort the whole scan.
      return [];
    }
  };

  while (
    jobIndex < jobs.length &&
    queriesUsed < maxQueries &&
    posts.length < targetPosts
  ) {
    // Never take more jobs than the remaining budget allows. The loop condition
    // guarantees the remainder is positive, so the chunk holds at least one job.
    const remainingBudget = Math.ceil(maxQueries - queriesUsed);
    const chunk = jobs.slice(jobIndex, jobIndex + Math.min(concurrency, remainingBudget));
    jobIndex += chunk.length;
    queriesUsed += chunk.length;

    const batches = await runWithConcurrency(chunk, runJob, {
      concurrency,
      staggerMs: placementMode ? [600, 1400] : [2000, 5000],
    });
    mergeFound(batches.flat());

    await onProgress?.(
      `網搜 ${queriesUsed}/${Math.min(maxQueries, jobs.length)} 次 · 已找到 ${posts.length} 篇` +
        (posts.length >= targetPosts ? "(達標,停止)" : "")
    );
  }

  if (posts.length === 0 && !isBraveSearchConfigured()) {
    await onProgress?.("未設定 BRAVE_SEARCH_API_KEY,請以 Threads API/瀏覽器海巡為主");
  }

  return posts.sort((a, b) => b.score - a.score);
}
|
|||
|
|
|
|||
|
|
/**
 * Web-searches each similar account with `site:…/@username` queries so that
 * account-oriented posts can still be collected when a browser cannot be opened.
 *
 * Brave is only used when the resolved priority is `"high"`; otherwise no
 * searches run and an empty array is returned. Each distinct normalized
 * username is searched once, so repeated accounts do not re-issue the same
 * queries. Queries run sequentially, and a query that throws is skipped.
 *
 * @param accounts Similar accounts to search for.
 * @param options Per-query result limit (default 10), placement mode, and keyword priority.
 * @returns Deduplicated posts tagged as `"account"` source, sorted by descending score.
 */
export async function discoverPostsFromSimilarAccounts(
  accounts: SimilarAccount[],
  options?: {
    perAccountLimit?: number;
    placementMode?: boolean;
    keywordPriority?: KeywordPriority;
  }
): Promise<WebDiscoveredPost[]> {
  const placementMode = options?.placementMode ?? false;
  const braveOptions = resolveBraveSearchOptions(placementMode, options?.keywordPriority);

  // Loop-invariant: without Brave no account can yield a post, so stop before
  // building any queries.
  if (braveOptions.priority !== "high") return [];

  const perAccountLimit = options?.perAccountLimit ?? 10;
  // The recency suffix is the same for every account; compute it once.
  const after = placementMode
    ? ` after:${formatGoogleAfterDate(PLACEMENT_WEB_SEARCH_MAX_AGE_DAYS)}`
    : "";

  const seen = new Set<string>();
  const searchedUsernames = new Set<string>();
  const posts: WebDiscoveredPost[] = [];

  for (const account of accounts) {
    const username = normalizeUsername(account.username);
    if (!username || searchedUsernames.has(username)) continue;
    searchedUsernames.add(username);
    const tag = `@${username}`;

    // account.reason explains why this account was recommended; it is not the
    // post's text and must not be passed off as a post.

    const queries = placementMode
      ? [
          `site:threads.com/@${username} 求推薦${after}`,
          `site:threads.com/@${username} 請益${after}`,
          `site:threads.com/@${username}${after}`,
          `site:threads.net/@${username}${after}`,
        ]
      : [`site:threads.com/@${username}`, `site:threads.net/@${username}`];

    for (const query of queries) {
      try {
        const { results } = await searchWebThorough(query, perAccountLimit, braveOptions);
        for (const item of results) {
          const post = parsePostFromUrl(
            item.link,
            item.title,
            item.snippet,
            tag,
            "account",
            placementMode
          );
          if (!post) continue;
          const key = postKey(post);
          if (seen.has(key)) continue;
          seen.add(key);
          posts.push(post);
        }
      } catch {
        // A single failed query must not block the whole patrol.
      }
    }
  }

  return posts.sort((a, b) => b.score - a.score);
}
|
|||
|
|
|
|||
|
|
/**
 * Chooses which of two versions of the same post to keep.
 *
 * The version from the higher-priority source wins (per `SOURCE_PRIORITY`;
 * a missing source counts as `"keyword"`). On equal priority the higher score
 * wins, and the incoming post wins an exact tie.
 *
 * @param existing Post already stored for this key.
 * @param incoming Newly encountered post with the same key.
 * @returns The post to keep.
 */
function pickPreferredPost(
  existing: WebDiscoveredPost,
  incoming: WebDiscoveredPost
): WebDiscoveredPost {
  const rankOf = (post: WebDiscoveredPost): number =>
    SOURCE_PRIORITY[post.scanSource ?? "keyword"];

  const existingRank = rankOf(existing);
  const incomingRank = rankOf(incoming);

  if (incomingRank > existingRank) return incoming;
  if (incomingRank < existingRank) return existing;

  if (incoming.score >= existing.score) return incoming;
  return existing;
}
|
|||
|
|
|
|||
|
|
/**
 * Merges two post lists into one deduplicated, score-ordered list.
 *
 * Posts sharing a `postKey` are collapsed with `pickPreferredPost`. Primary
 * posts are processed before supplemental ones.
 *
 * @param primary Posts processed first.
 * @param supplemental Posts processed second.
 * @param max Upper bound passed to `slice(0, max)` after sorting.
 * @returns The highest-scoring merged posts, in descending score order.
 */
export function mergeScanPosts(
  primary: WebDiscoveredPost[],
  supplemental: WebDiscoveredPost[],
  max: number
): WebDiscoveredPost[] {
  const bestByKey = new Map<string, WebDiscoveredPost>();

  for (const group of [primary, supplemental]) {
    for (const candidate of group) {
      const key = postKey(candidate);
      const current = bestByKey.get(key);
      bestByKey.set(key, current ? pickPreferredPost(current, candidate) : candidate);
    }
  }

  const ranked = Array.from(bestByKey.values());
  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, max);
}
|
|||
|
|
|
|||
|
|
/**
 * Public wrapper around `withSource`: returns a copy of `post` tagged with
 * `scanSource` and with its score scaled by that source's boost.
 *
 * @param post Ranked post, optionally carrying a `searchTag`.
 * @param scanSource Discovery source to record.
 * @returns The tagged, re-scored copy.
 */
export function tagPostSource<T extends RankedPost & { searchTag?: string }>(
  post: T,
  scanSource: ScanPostSource
): WebDiscoveredPost {
  const tagged = withSource(post, scanSource);
  return tagged;
}
|