haixunMaster/haixun-backend/internal/library/placement/dual_track.go

369 lines
9.8 KiB
Go
Raw Permalink Normal View History

2026-06-24 10:02:42 +00:00
package placement
import (
"context"
"fmt"
"strings"
2026-06-24 16:48:56 +00:00
"time"
2026-06-24 10:02:42 +00:00
libkg "haixun-backend/internal/library/knowledge"
2026-06-25 08:20:03 +00:00
"haixun-backend/internal/library/websearch"
2026-06-24 10:02:42 +00:00
)
// Per-query result caps. RunDualTrackDiscover picks one of these for each
// query according to its dimension and passes it down as the discovery limit.
const (
	// relevanceLimitPerTag is the limit used for relevance queries.
	relevanceLimitPerTag = 12
	// recencyLimitPerTag is the limit used for recency queries.
	recencyLimitPerTag = 8
)
type ScanCandidate struct {
Permalink string
ExternalID string
Author string
Text string
SearchTag string
QueryDimension QueryDimension
GraphNodeID string
ProductFitScore int
Source DiscoverChannel
HasRelevance bool
HasRecency bool
Priority string
2026-06-25 08:20:03 +00:00
AuthorVerified bool
FollowerCount int
2026-06-24 10:02:42 +00:00
LikeCount int
ReplyCount int
EngagementScore int
PlacementScore int
SolvedByProduct bool
2026-06-24 16:48:56 +00:00
PostedAt string
2026-06-24 10:02:42 +00:00
Replies []ReplyCandidate
}
type DualTrackInput struct {
2026-06-24 16:48:56 +00:00
Nodes []libkg.Node
PatrolKeywords []string
Exclusions []string
Member MemberContext
2026-06-25 08:20:03 +00:00
WebSearch websearch.Client
2026-06-24 16:48:56 +00:00
Crawler CrawlerSearchFn
Limit int // max queries budget; 0 = default
OnCheckpoint func(candidates []ScanCandidate) error
2026-06-24 10:02:42 +00:00
}
// DualTrackProgress is the progress callback used by RunDualTrackDiscover. It
// receives a status message and a percentage value.
type DualTrackProgress func(message string, pct int)
// CollectTagQueries builds crawl jobs from selected graph nodes.
2026-06-25 08:20:03 +00:00
func CollectTagQueries(nodes []libkg.Node, provider websearch.Provider) []TagQuery {
2026-06-24 10:02:42 +00:00
out := make([]TagQuery, 0, len(nodes)*4)
for _, node := range nodes {
if !node.SelectedForScan {
continue
}
fit := node.ProductFitScore
2026-06-24 17:30:47 +00:00
derived := node.DerivedTags
if len(derived.Relevance) == 0 && len(derived.Recency) == 0 {
derived = libkg.DerivePatrolTagsForNode(node, libkg.PatrolTagInput{})
}
for _, tag := range derived.Relevance {
2026-06-24 10:02:42 +00:00
tag = strings.TrimSpace(tag)
if tag == "" {
continue
}
2026-06-25 08:20:03 +00:00
q := BuildRelevanceQuery(provider, tag)
2026-06-24 10:02:42 +00:00
if q == "" {
continue
}
out = append(out, TagQuery{
Tag: tag,
Query: q,
Dimension: QueryRelevance,
GraphNodeID: node.ID,
ProductFitScore: fit,
})
}
2026-06-24 17:30:47 +00:00
for _, tag := range derived.Recency {
2026-06-24 10:02:42 +00:00
tag = strings.TrimSpace(tag)
if tag == "" {
continue
}
2026-06-25 08:20:03 +00:00
q7 := BuildRecencyQuery(provider, tag, IdealMaxPostAgeDays)
2026-06-24 10:02:42 +00:00
if q7 != "" {
out = append(out, TagQuery{
Tag: tag,
Query: q7,
Dimension: QueryRecency,
GraphNodeID: node.ID,
ProductFitScore: fit,
RecencyDays: IdealMaxPostAgeDays,
})
}
2026-06-25 08:20:03 +00:00
q30 := BuildRecencyQuery(provider, tag, MaxPostAgeDays)
2026-06-24 10:02:42 +00:00
if q30 != "" && q30 != q7 {
out = append(out, TagQuery{
Tag: tag,
Query: q30,
Dimension: QueryRecency,
GraphNodeID: node.ID,
ProductFitScore: fit,
RecencyDays: MaxPostAgeDays,
})
}
}
}
return out
}
// RunDualTrackDiscover executes relevance + recency queries and merges by permalink.
func RunDualTrackDiscover(ctx context.Context, input DualTrackInput, onProgress DualTrackProgress) ([]ScanCandidate, error) {
2026-06-25 08:20:03 +00:00
queries := ResolveTagQueries(input.Nodes, input.PatrolKeywords, input.Member.WebSearchProviderEnum())
2026-06-24 10:02:42 +00:00
if len(queries) == 0 {
2026-06-24 16:48:56 +00:00
if len(input.PatrolKeywords) > 0 {
return nil, fmt.Errorf("海巡關鍵字格式無效,請改用 28 字的真人搜尋短句")
}
2026-06-24 17:30:47 +00:00
selected := 0
for _, node := range input.Nodes {
if node.SelectedForScan {
selected++
}
}
if selected > 0 {
return nil, fmt.Errorf("已勾選節點但沒有可用的海巡 tag請重新擴展圖譜或手動編輯 tag")
}
return nil, fmt.Errorf("請先勾選要海巡的節點並儲存")
2026-06-24 10:02:42 +00:00
}
merged := map[string]*ScanCandidate{}
order := make([]string, 0, 64)
runQuery := func(tq TagQuery, limit int) error {
posts, channel, err := discoverForQuery(ctx, input, tq, limit)
if err != nil {
return err
}
for _, post := range posts {
if MatchesExclusion(post.Text, input.Exclusions) {
continue
}
if !PassesPlacementFilter(post.Text) {
continue
}
key := post.Permalink
if key == "" {
continue
}
existing, ok := merged[key]
if !ok {
priority := "relevant"
if tq.Dimension == QueryRecency {
priority = "recent"
}
extID := post.ExternalID
if extID == "" {
if parsed, ok := ParseThreadsPostFromWebResult(post.Text, "", post.Permalink); ok {
extID = parsed.ExternalID
}
}
merged[key] = &ScanCandidate{
Permalink: post.Permalink,
ExternalID: extID,
Author: post.Author,
2026-06-25 08:20:03 +00:00
AuthorVerified: post.AuthorVerified,
FollowerCount: post.FollowerCount,
2026-06-24 10:02:42 +00:00
Text: post.Text,
SearchTag: tq.Tag,
QueryDimension: tq.Dimension,
GraphNodeID: tq.GraphNodeID,
ProductFitScore: tq.ProductFitScore,
Source: channel,
HasRelevance: tq.Dimension == QueryRelevance,
HasRecency: tq.Dimension == QueryRecency,
Priority: priority,
2026-06-25 08:20:03 +00:00
LikeCount: post.LikeCount,
ReplyCount: post.ReplyCount,
2026-06-24 10:02:42 +00:00
PlacementScore: computePlacementScore(post.Text, tq.ProductFitScore, tq.Dimension == QueryRecency),
SolvedByProduct: tq.ProductFitScore >= 55,
2026-06-24 16:48:56 +00:00
PostedAt: strings.TrimSpace(post.PostedAt),
2026-06-24 10:02:42 +00:00
}
order = append(order, key)
continue
}
if tq.Dimension == QueryRelevance {
existing.HasRelevance = true
}
if tq.Dimension == QueryRecency {
existing.HasRecency = true
}
if tq.ProductFitScore > existing.ProductFitScore {
existing.ProductFitScore = tq.ProductFitScore
existing.SolvedByProduct = tq.ProductFitScore >= 55
}
2026-06-24 16:48:56 +00:00
if strings.TrimSpace(existing.PostedAt) == "" && strings.TrimSpace(post.PostedAt) != "" {
existing.PostedAt = strings.TrimSpace(post.PostedAt)
}
2026-06-24 10:02:42 +00:00
}
return nil
}
total := len(queries)
for i, tq := range queries {
if onProgress != nil {
pct := 10 + ((i + 1) * 75 / max(total, 1))
onProgress(fmt.Sprintf("雙軌海巡 %d/%d%s", i+1, total, tq.Tag), pct)
}
limit := relevanceLimitPerTag
if tq.Dimension == QueryRecency {
limit = recencyLimitPerTag
}
if err := runQuery(tq, limit); err != nil {
return nil, err
}
2026-06-24 16:48:56 +00:00
if input.OnCheckpoint != nil {
snapshot := snapshotMergedCandidates(merged, order, false)
if err := input.OnCheckpoint(snapshot); err != nil {
return nil, err
}
2026-06-24 10:02:42 +00:00
}
2026-06-25 08:20:03 +00:00
if input.Member.AllowsCrawler && input.Member.BrowserConnected && i < total-1 {
2026-06-24 16:48:56 +00:00
if err := politeDiscoverPause(ctx); err != nil {
return nil, err
}
2026-06-24 10:02:42 +00:00
}
}
2026-06-24 16:48:56 +00:00
out := snapshotMergedCandidates(merged, order, true)
2026-06-24 10:02:42 +00:00
if onProgress != nil {
onProgress(fmt.Sprintf("合併完成,共 %d 篇候選貼文", len(out)), 90)
}
return out, nil
}
func discoverForQuery(ctx context.Context, input DualTrackInput, tq TagQuery, limit int) ([]DiscoverPost, DiscoverChannel, error) {
req := DiscoverRequest{
Query: tq.Query,
Keyword: tq.Tag,
Recency: tq.Dimension == QueryRecency,
Limit: limit,
Member: input.Member,
Crawler: input.Crawler,
}
posts, channel, err := Discover(ctx, req)
if err == nil && len(posts) > 0 {
return posts, channel, nil
}
2026-06-25 08:20:03 +00:00
if input.WebSearch == nil || !input.WebSearch.Enabled() {
2026-06-24 10:02:42 +00:00
if err != nil {
return nil, "", err
}
2026-06-25 08:20:03 +00:00
return nil, "", fmt.Errorf("%s 未設定且 Threads API 無結果", input.Member.WebSearchProviderLabel())
2026-06-24 10:02:42 +00:00
}
2026-06-25 08:20:03 +00:00
webPosts, werr := discoverViaWebSearch(ctx, input.WebSearch, input.Member, tq, limit)
if werr != nil {
2026-06-24 10:02:42 +00:00
if err != nil {
return nil, "", err
}
2026-06-25 08:20:03 +00:00
return nil, "", werr
2026-06-24 10:02:42 +00:00
}
2026-06-25 08:20:03 +00:00
return webPosts, input.Member.WebSearchDiscoverChannel(), nil
2026-06-24 10:02:42 +00:00
}
2026-06-25 08:20:03 +00:00
func discoverViaWebSearch(ctx context.Context, client websearch.Client, member MemberContext, tq TagQuery, limit int) ([]DiscoverPost, error) {
res, err := client.Search(ctx, websearch.SearchOptions{
Query: tq.Query,
Limit: limit,
Mode: websearch.ModeThreadsDiscover,
Country: member.BraveCountry,
SearchLang: member.BraveSearchLang,
UserLocation: member.ExaUserLocation,
StartPublishedDate: PublishedAfterForRecency(member.WebSearchProviderEnum(), tq.RecencyDays),
2026-06-24 10:02:42 +00:00
})
if err != nil {
return nil, err
}
if res.Status != "success" || len(res.Results) == 0 {
return nil, nil
}
2026-06-25 08:20:03 +00:00
source := member.WebSearchDiscoverChannel()
2026-06-24 10:02:42 +00:00
out := make([]DiscoverPost, 0, len(res.Results))
for _, item := range res.Results {
parsed, ok := ParseThreadsPostFromWebResult(item.Title, item.Snippet, item.URL)
if !ok {
continue
}
out = append(out, DiscoverPost{
Text: parsed.Text,
Permalink: parsed.Permalink,
ExternalID: parsed.ExternalID,
Author: parsed.Author,
2026-06-25 08:20:03 +00:00
Source: source,
2026-06-24 10:02:42 +00:00
})
}
return out, nil
}
2026-06-24 16:48:56 +00:00
// snapshotMergedCandidates returns copies of the merged candidates in the
// order given by order.
//
// Every candidate is passed through finalizeScanCandidate first, which
// updates the entry stored in merged in place. When applyFinalFilter is true,
// candidates with a ProductFitScore below 30 are dropped unless their priority
// is "gold". Keys in order that are absent from merged, or that map to nil,
// are skipped rather than dereferenced.
func snapshotMergedCandidates(merged map[string]*ScanCandidate, order []string, applyFinalFilter bool) []ScanCandidate {
	out := make([]ScanCandidate, 0, len(order))
	for _, key := range order {
		item := merged[key]
		if item == nil {
			// A missing key yields a nil pointer; reading its fields below
			// would panic.
			continue
		}
		finalizeScanCandidate(item)
		if applyFinalFilter && item.ProductFitScore < 30 && item.Priority != "gold" {
			continue
		}
		out = append(out, *item)
	}
	return out
}
// finalizeScanCandidate recomputes the derived fields of a candidate from its
// current flags and product-fit score. A nil item is ignored.
//
// Priority becomes "gold" when the post was found by both dimensions and its
// fit score is at least 45, "recent" when it was found by a recency query, and
// "relevant" otherwise. PlacementScore and SolvedByProduct (fit >= 55) are
// refreshed as well.
func finalizeScanCandidate(item *ScanCandidate) {
	if item == nil {
		return
	}

	fit := item.ProductFitScore
	switch {
	case item.HasRelevance && item.HasRecency && fit >= 45:
		item.Priority = "gold"
	case item.HasRecency:
		item.Priority = "recent"
	default:
		item.Priority = "relevant"
	}

	item.SolvedByProduct = fit >= 55
	item.PlacementScore = computePlacementScore(item.Text, fit, item.HasRecency)
}
2026-06-24 10:02:42 +00:00
// computePlacementScore rates a post for placement.
//
// The score starts at 30 plus a quarter of productFit (integer division) and
// gains 20 when HasPlacementIntent(text) holds, 12 when
// LooksLikeRecommendationPost(text) holds, 15 when recent is true and 8 when
// productFit is at least 60. The result is capped at 100; there is no lower
// bound.
func computePlacementScore(text string, productFit int, recent bool) int {
	const ceiling = 100

	total := 30 + productFit/4
	if productFit >= 60 {
		total += 8
	}
	if recent {
		total += 15
	}
	if HasPlacementIntent(text) {
		total += 20
	}
	if LooksLikeRecommendationPost(text) {
		total += 12
	}

	if total > ceiling {
		return ceiling
	}
	return total
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
2026-06-24 16:48:56 +00:00
// politeDiscoverPause blocks for two seconds plus the extra delay returned by
// jitterDuration(2*time.Second), or until ctx is done, whichever comes first.
// It returns ctx.Err() when the context ends the wait and nil otherwise.
func politeDiscoverPause(ctx context.Context) error {
	delay := 2*time.Second + jitterDuration(2*time.Second)

	pause := time.NewTimer(delay)
	defer pause.Stop()

	select {
	case <-pause.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}