haixunMaster/lib/services/scan-tasks.ts

273 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { isOnTopicTag } from "@/lib/topic-anchor";
import { isPlacementGoal } from "@/lib/types/topic-goal";
import {
isAccountTag,
normalizeUsername,
type ResearchMap,
} from "@/lib/types/research";
/** Discriminates the two kinds of scan work: a keyword search or a scan of a single account. */
export type ScanTaskKind = "keyword" | "account";
/** One unit of scan work produced by the task builders in this module. */
export interface ScanTask {
/** Dedupe key in the form `${kind}:${query}`; assigned by addTask. */
id: string;
/** Whether `query` holds a search keyword or an account username. */
kind: ScanTaskKind;
/** The keyword to search, or the normalized username for account tasks. */
query: string;
/** Human-readable name: the keyword itself, or `@username` for account tasks (presumably for display). */
label: string;
/** Per-task numeric cap chosen by the builder (12-16 in this file); presumably the max items to fetch — confirm against the consumer. */
limit: number;
}
import { getBrowserConcurrency } from "@/lib/threads-browser/human-behavior";
/** Hard ceiling on parallel scan tasks; resolveScanConcurrency never returns more than this. */
const MAX_CONCURRENCY = 2;
/**
 * Partitions scan tasks by kind while keeping their relative order.
 *
 * @param tasks Mixed keyword and account tasks.
 * @returns The keyword tasks and the account tasks as two separate arrays.
 */
export function splitScanTasks(tasks: ScanTask[]): {
  keywordTasks: ScanTask[];
  accountTasks: ScanTask[];
} {
  const keywordTasks: ScanTask[] = [];
  const accountTasks: ScanTask[] = [];
  for (const task of tasks) {
    if (task.kind === "keyword") {
      keywordTasks.push(task);
    } else if (task.kind === "account") {
      accountTasks.push(task);
    }
  }
  return { keywordTasks, accountTasks };
}
/**
 * Returns the usernames of the account tags the user explicitly selected.
 * Similar accounts from the research map are not added automatically.
 *
 * @param selectedTags Tags currently selected by the user.
 * @returns Normalized usernames, in selection order, with empty results dropped.
 */
export function getSelectedAccountUsernames(selectedTags: string[]): string[] {
  const accountTags = selectedTags.filter(isAccountTag);
  const usernames: string[] = [];
  for (const accountTag of accountTags) {
    const username = normalizeUsername(accountTag);
    // Skip tags that normalize to nothing.
    if (username) usernames.push(username);
  }
  return usernames;
}
/**
 * Tells whether the selection contains at least one usable account tag.
 *
 * @param selectedTags Tags currently selected by the user.
 * @returns `true` when getSelectedAccountUsernames yields at least one username.
 */
export function hasSelectedAccountTags(selectedTags: string[]): boolean {
  const usernames = getSelectedAccountUsernames(selectedTags);
  return usernames.length !== 0;
}
/**
 * Picks how many scan tasks may run in parallel.
 *
 * Parallelism is deliberately bounded (and, per the original note, single-threaded
 * by default) to lower the risk of Meta flagging the traffic as a bot.
 *
 * @param taskCount Number of tasks waiting to run; values below 1 are treated as 1.
 * @returns The task count, capped by both MAX_CONCURRENCY and getBrowserConcurrency().
 */
export function resolveScanConcurrency(taskCount: number): number {
  const ceiling = Math.min(MAX_CONCURRENCY, getBrowserConcurrency());
  const wanted = Math.max(taskCount, 1);
  return Math.min(wanted, ceiling);
}
/**
 * Appends a task unless one with the same kind and query was already added.
 *
 * @param tasks Output list, mutated in place.
 * @param seen Dedupe keys recorded so far, mutated in place.
 * @param task Task fields without `id`; the id is derived as `${kind}:${query}`.
 */
function addTask(
  tasks: ScanTask[],
  seen: Set<string>,
  task: Omit<ScanTask, "id">
): void {
  const dedupeKey = `${task.kind}:${task.query}`;
  if (!seen.has(dedupeKey)) {
    seen.add(dedupeKey);
    tasks.push({ ...task, id: dedupeKey });
  }
}
/** Conversational openers ("may I ask", "does anyone", ...) stripped from the start of a phrase. */
const PLACEMENT_QUERY_PREFIX_RE = /^(請問|想問|有人|大家|想知道|想請教|求助|請益)/;
/** Maximum number of search queries generated for placement mode. */
const PLACEMENT_QUERY_MAX = 8;
/**
 * Turns a research-map question or content pillar into a keyword that can be
 * searched on Threads.
 *
 * Steps, in order:
 *  1. Remove punctuation: ASCII, CJK, and the full-width / curly-quote variants
 *     that commonly appear in Chinese text.
 *  2. Trim surrounding whitespace so the `^`-anchored opener pattern can match.
 *     Previously the trim ran last, so input with leading whitespace kept its opener.
 *  3. Strip one leading conversational opener.
 *  4. Remove all remaining whitespace.
 *  5. Cap the result at 28 UTF-16 code units, matching the upper length bound
 *     that placementPhraseScore accepts.
 *
 * @param text Raw question, pillar, tag, or seed query.
 * @returns The normalized phrase; may be empty.
 */
export function normalizePlacementSearchPhrase(text: string): string {
  return text
    .replace(
      /[?!。.、,;:"'「」【】()\uFF1F\uFF01\uFF0C\uFF1B\uFF1A\u201C\u201D\u2018\u2019\uFF08\uFF09]/g,
      ""
    )
    .trim()
    .replace(PLACEMENT_QUERY_PREFIX_RE, "")
    .replace(/\s+/g, "")
    .slice(0, 28);
}
/**
 * Rates how suitable a normalized phrase is as a placement search query.
 *
 * @param phrase Candidate phrase, already normalized.
 * @param seedQuery The topic's seed query, used for the overlap bonus.
 * @returns `-1` when the phrase is shorter than 3 or longer than 28 characters;
 *   otherwise its length, plus 6 if the phrase and a non-empty seed contain one
 *   another, plus 4 if the phrase contains a recommendation / help-seeking /
 *   pain-point word.
 */
function placementPhraseScore(phrase: string, seedQuery: string): number {
  const length = phrase.length;
  const withinBounds = length >= 3 && length <= 28;
  if (!withinBounds) return -1;

  const overlapsSeed =
    Boolean(seedQuery) && (phrase.includes(seedQuery) || seedQuery.includes(phrase));
  const signalsNeed = /推薦|請益|求助|請問|怎麼|用什麼|哪款|困擾|煩惱|怕|癢|過敏/.test(phrase);

  return length + (overlapsSeed ? 6 : 0) + (signalsNeed ? 4 : 0);
}
/**
 * Placement mode: builds search queries automatically from the audience
 * questions and content pillars, so the user does not have to tick tags by hand.
 *
 * Ranking: every candidate is normalized and scored with placementPhraseScore.
 * Preferred tags get +30, the seed query gets +2, and questions and pillars get
 * no bonus. Candidates are sorted by score (highest first), de-duplicated
 * case-insensitively, and cut to PLACEMENT_QUERY_MAX entries.
 *
 * @param researchMap Research map for the topic. When absent, only the seed query is used.
 * @param seedQuery The topic's seed query.
 * @param selectedTags Tags chosen by the user. When empty, the map's non-account
 *   suggested tags are used as the preferred tags instead.
 * @returns Ranked, unique search phrases.
 */
export function resolvePlacementSearchQueries(
  researchMap: ResearchMap | null | undefined,
  seedQuery: string,
  selectedTags: string[] = []
): string[] {
  const normalizedSeed = normalizePlacementSearchPhrase(seedQuery);
  const seedIsSearchable = normalizedSeed.length >= 3;

  // Without a research map the seed query is the only possible source.
  if (!researchMap) {
    return seedIsSearchable ? [normalizedSeed] : [];
  }

  const ranked: Array<{ phrase: string; score: number }> = [];
  const collect = (rawTexts: string[], bonus: number): void => {
    for (const raw of rawTexts) {
      const phrase = normalizePlacementSearchPhrase(raw);
      const base = placementPhraseScore(phrase, seedQuery);
      if (base < 0) continue;
      ranked.push({ phrase, score: base + bonus });
    }
  };

  let preferredTags: string[];
  if (selectedTags.length > 0) {
    preferredTags = selectedTags;
  } else {
    preferredTags = (researchMap.suggestedTags ?? [])
      .filter((tag) => tag.searchType !== "帳號" && !isAccountTag(tag.tag))
      .map((tag) => tag.tag);
  }

  collect(preferredTags, 30);
  collect([...(researchMap.questions ?? []), ...(researchMap.pillars ?? [])], 0);
  if (seedIsSearchable) {
    ranked.push({
      phrase: normalizedSeed,
      score: placementPhraseScore(normalizedSeed, seedQuery) + 2,
    });
  }

  // Sort is stable, so equal scores keep insertion order (tags, questions, pillars, seed).
  ranked.sort((left, right) => right.score - left.score);

  const emitted = new Set<string>();
  const queries: string[] = [];
  for (const { phrase } of ranked) {
    if (queries.length >= PLACEMENT_QUERY_MAX) break;
    const dedupeKey = phrase.toLowerCase();
    if (emitted.has(dedupeKey)) continue;
    emitted.add(dedupeKey);
    queries.push(phrase);
  }
  return queries;
}
/**
 * Tells whether placement mode can produce at least one search query.
 *
 * @param researchMap Research map for the topic, if any.
 * @param seedQuery The topic's seed query; defaults to an empty string.
 * @returns `true` when resolvePlacementSearchQueries yields at least one query.
 */
export function hasPlacementSearchSources(
  researchMap: ResearchMap | null | undefined,
  seedQuery = ""
): boolean {
  const queries = resolvePlacementSearchQueries(researchMap, seedQuery);
  return queries.length !== 0;
}
/**
 * Builds the keyword scan tasks for placement mode.
 *
 * @param params.researchMap Research map for the topic, if any.
 * @param params.seedQuery The topic's seed query.
 * @param params.selectedTags Tags chosen by the user, if any.
 * @returns One keyword task (limit 14) per query from resolvePlacementSearchQueries.
 */
export function buildPlacementScanTasks(params: {
  researchMap?: ResearchMap | null;
  seedQuery: string;
  selectedTags?: string[];
}): ScanTask[] {
  const { researchMap, seedQuery, selectedTags } = params;
  const collected: ScanTask[] = [];
  const dedupe = new Set<string>();

  resolvePlacementSearchQueries(researchMap, seedQuery, selectedTags).forEach((query) => {
    addTask(collected, dedupe, { kind: "keyword", query, label: query, limit: 14 });
  });

  return collected;
}
/**
 * Converts the user's selected tags into scan tasks.
 *
 * Account tags:
 *  - skipped entirely in placement mode;
 *  - otherwise accepted only when the research map lists the tag with
 *    searchType "帳號" or lists the account under `similarAccounts`. Manually
 *    typed accounts that the map does not confirm are not scanned.
 *
 * Keyword tags:
 *  - tags present in the research map are accepted as-is;
 *  - tags not in the map must pass isOnTopicTag against the topic label and
 *    seed query, to keep manual tags from drifting off topic.
 *
 * @param params.selectedTags Tags chosen by the user.
 * @param params.researchMap Research map for the topic, if any.
 * @param params.seedQuery The topic's seed query.
 * @param params.topicGoal Topic goal; placement goals switch on placement mode.
 * @param params.topicLabel Topic label; defaults to `seedQuery`.
 * @returns De-duplicated scan tasks in selection order.
 */
export function buildScanTasks(params: {
  selectedTags: string[];
  researchMap?: ResearchMap | null;
  seedQuery: string;
  topicGoal?: string | null;
  topicLabel?: string;
}): ScanTask[] {
  const { selectedTags, researchMap, seedQuery, topicGoal, topicLabel = seedQuery } = params;
  const placementMode = isPlacementGoal(topicGoal);
  const suggestedTags = researchMap?.suggestedTags ?? [];
  const tagMeta = new Map(suggestedTags.map((t) => [t.tag, t]));
  // pickDefaultSelectedTags emits account tags with a forced "@" prefix, so a
  // suggested tag stored as "foo" is selected as "@foo". Register that alias,
  // otherwise the lookup below misses and the account is treated as unapproved.
  for (const suggested of suggestedTags) {
    if (suggested.searchType !== "帳號") continue;
    const handle = suggested.tag.startsWith("@") ? suggested.tag : `@${suggested.tag}`;
    if (!tagMeta.has(handle)) tagMeta.set(handle, suggested);
  }

  const tasks: ScanTask[] = [];
  const seen = new Set<string>();

  for (const tag of selectedTags) {
    const meta = tagMeta.get(tag);
    const isAccount = isAccountTag(tag) || meta?.searchType === "帳號";

    if (isAccount) {
      if (placementMode) continue;
      const username = normalizeUsername(tag);
      const usernameLower = username.toLowerCase();
      const isApprovedTopicAccount =
        meta?.searchType === "帳號" ||
        (researchMap?.similarAccounts ?? []).some(
          (account) => normalizeUsername(account.username).toLowerCase() === usernameLower
        );
      // Manually entered accounts that the research map does not confirm as
      // belonging to the same field are not scanned.
      if (!isApprovedTopicAccount) continue;
      addTask(tasks, seen, {
        kind: "account",
        query: username,
        label: `@${username}`,
        limit: 15,
      });
      continue;
    }

    // Research-map tags are already vetted, so they need not literally contain
    // the seed query. Only manual tags outside the map get the strict topic
    // anchor. (`isAccount` is false here, so no account re-check is needed.)
    const isApprovedResearchTag = Boolean(meta);
    if (!isApprovedResearchTag && !isOnTopicTag(tag, { label: topicLabel, query: seedQuery })) {
      continue;
    }

    const isShort = meta?.searchType === "短詞" || tag.length <= 4;
    const isNeedTag =
      meta?.searchIntent === "需求" ||
      meta?.searchIntent === "求助" ||
      meta?.searchIntent === "痛點";
    addTask(tasks, seen, {
      kind: "keyword",
      query: tag,
      label: tag,
      limit: placementMode ? (isNeedTag ? 16 : isShort ? 14 : 12) : isShort ? 15 : 12,
    });
  }

  return tasks;
}
/**
 * Chooses the tags that are pre-selected for a topic.
 *
 * Placement goals prioritise need / help-seeking / pain-point tags, then
 * scenario tags, short terms and one quote, up to 10 tags. Other goals take
 * short terms, scenarios, one quote and one account, up to 8 tags. If the
 * prioritised pick falls short of the target, it is padded from the remaining
 * suggested tags in their original order.
 *
 * Changes from the previous version:
 *  - a research map without `suggestedTags` yields an empty list instead of
 *    throwing, matching the `?? []` guard used elsewhere in this module;
 *  - placement is detected with isPlacementGoal, the same predicate
 *    buildScanTasks uses, instead of a hard-coded string comparison;
 *  - the placement fallback no longer pads with account tags, because
 *    buildScanTasks skips account tags in placement mode.
 *
 * @param researchMap Research map whose suggested tags are picked from.
 * @param topicGoal Topic goal; placement goals use the placement priorities.
 * @returns Unique tags, at most 10 for placement goals and 8 otherwise.
 */
export function pickDefaultSelectedTags(
  researchMap: ResearchMap,
  topicGoal?: string | null
): string[] {
  const tags = researchMap.suggestedTags ?? [];
  const shorts = tags.filter((t) => t.searchType === "短詞").map((t) => t.tag);
  const quotes = tags.filter((t) => t.searchType === "語錄").map((t) => t.tag);
  const scenes = tags.filter((t) => t.searchType === "情境").map((t) => t.tag);
  const needTags = tags
    .filter(
      (t) => t.searchIntent === "需求" || t.searchIntent === "求助" || t.searchIntent === "痛點"
    )
    .map((t) => t.tag);
  // Account tags are always surfaced with a leading "@".
  const acctTags = tags
    .filter((t) => t.searchType === "帳號")
    .map((t) => (t.tag.startsWith("@") ? t.tag : `@${t.tag}`));

  const isPlacement = isPlacementGoal(topicGoal);
  const picked = isPlacement
    ? [
        ...needTags.slice(0, 6),
        ...scenes.slice(0, 5),
        ...shorts.slice(0, 4),
        ...quotes.slice(0, 1),
      ]
    : [
        ...shorts.slice(0, 4),
        ...scenes.slice(0, 2),
        ...quotes.slice(0, 1),
        ...acctTags.slice(0, 1),
      ];
  const target = isPlacement ? 10 : 8;

  const keywordFallback = tags
    .filter((t) => t.searchType !== "帳號" && !isAccountTag(t.tag))
    .map((t) => t.tag);
  const fallbackOrder = isPlacement
    ? keywordFallback
    : [...keywordFallback, ...acctTags.slice(0, 2)];

  // A Set keeps insertion order and drops tags that fall into several categories.
  const balanced = new Set<string>(picked);
  for (const tag of fallbackOrder) {
    if (balanced.size >= target) break;
    balanced.add(tag);
  }
  return [...balanced].slice(0, target);
}