thread-master/backend/internal/library/placement/dual_track.go

369 lines
9.8 KiB
Go
Raw Normal View History

2026-06-26 08:37:04 +00:00
package placement
import (
"context"
"fmt"
"strings"
"time"
libkg "haixun-backend/internal/library/knowledge"
"haixun-backend/internal/library/websearch"
)
// Per-query result caps. RunDualTrackDiscover passes one of these as the limit
// of every discovery call, chosen by the query's dimension.
const (
// relevanceLimitPerTag is the limit used for queries whose dimension is not
// QueryRecency.
relevanceLimitPerTag = 12
// recencyLimitPerTag is the limit used for QueryRecency queries.
recencyLimitPerTag = 8
)
// ScanCandidate is one discovered post after merging the results of all
// relevance and recency queries. RunDualTrackDiscover keeps a single candidate
// per permalink.
type ScanCandidate struct {
// Permalink is the post's permalink; it is the key used to merge duplicates.
Permalink string
// ExternalID is copied from the discovered post. When the post carries none,
// RunDualTrackDiscover tries to recover it with ParseThreadsPostFromWebResult.
ExternalID string
// Author is copied from the discovered post.
Author string
// Text is the post text copied from the discovered post.
Text string
// SearchTag is the tag of the query that first surfaced the post.
SearchTag string
// QueryDimension is the dimension of the query that first surfaced the post.
QueryDimension QueryDimension
// GraphNodeID is the graph node ID carried by the query that first surfaced
// the post.
GraphNodeID string
// ProductFitScore is the highest ProductFitScore among the queries that
// surfaced the post.
ProductFitScore int
// Source is the discovery channel reported for the query that first surfaced
// the post.
Source DiscoverChannel
// HasRelevance is true once any QueryRelevance query has returned the post.
HasRelevance bool
// HasRecency is true once any QueryRecency query has returned the post.
HasRecency bool
// Priority is "gold", "recent" or "relevant"; finalizeScanCandidate assigns it.
Priority string
// AuthorVerified is copied from the discovered post.
AuthorVerified bool
// FollowerCount is copied from the discovered post.
FollowerCount int
// LikeCount is copied from the discovered post.
LikeCount int
// ReplyCount is copied from the discovered post.
ReplyCount int
// EngagementScore is not assigned by the code visible in this file.
// NOTE(review): presumably filled in by a later stage — confirm.
EngagementScore int
// PlacementScore is the result of computePlacementScore for this candidate.
PlacementScore int
// SolvedByProduct is true when ProductFitScore is at least 55.
SolvedByProduct bool
// PostedAt is the discovered post's PostedAt value with surrounding
// whitespace trimmed; it stays empty until some sighting provides one.
PostedAt string
// Replies is not populated by the code visible in this file.
Replies []ReplyCandidate
}
// DualTrackInput bundles everything RunDualTrackDiscover needs for one run.
type DualTrackInput struct {
// Nodes are the graph nodes handed to ResolveTagQueries; their SelectedForScan
// flag also decides which error is returned when no query can be built.
Nodes []libkg.Node
// PatrolKeywords are handed to ResolveTagQueries alongside Nodes.
PatrolKeywords []string
// Exclusions is passed to MatchesExclusion with each post's text; matching
// posts are dropped.
Exclusions []string
// Member supplies the web-search provider settings and the crawler flags
// (AllowsCrawler, BrowserConnected) that enable the pause between queries.
Member MemberContext
// WebSearch is the fallback client used when Discover fails or returns no
// posts. It may be nil; it is only used when Enabled() reports true.
WebSearch websearch.Client
// Crawler is forwarded unchanged in the DiscoverRequest.
Crawler CrawlerSearchFn
// NOTE(review): Limit is not read by the code visible in this file — confirm
// where the budget is enforced.
Limit int // max queries budget; 0 = default
// OnCheckpoint, when non-nil, is called after every query with a snapshot of
// the candidates merged so far (final filter not applied). A non-nil error
// aborts the run.
OnCheckpoint func(candidates []ScanCandidate) error
}
// DualTrackProgress receives a status message and a percentage.
// RunDualTrackDiscover calls it once before each query with a value of at most
// 85, and once with 90 after the final merge.
type DualTrackProgress func(message string, pct int)
// CollectTagQueries turns the graph nodes selected for scanning into crawl
// queries.
//
// Nodes whose SelectedForScan flag is false are ignored. For every other node
// the stored DerivedTags are used; when both of its tag lists are empty the
// tags are derived on the fly with libkg.DerivePatrolTagsForNode. Each
// non-blank relevance tag yields one QueryRelevance entry. Each non-blank
// recency tag yields up to two QueryRecency entries: one built for
// IdealMaxPostAgeDays and, when its query text differs from the first, one
// built for MaxPostAgeDays. A tag whose built query is empty contributes
// nothing for that query.
func CollectTagQueries(nodes []libkg.Node, provider websearch.Provider) []TagQuery {
	result := make([]TagQuery, 0, 4*len(nodes))
	for _, n := range nodes {
		if !n.SelectedForScan {
			continue
		}

		tags := n.DerivedTags
		if len(tags.Relevance) == 0 && len(tags.Recency) == 0 {
			tags = libkg.DerivePatrolTagsForNode(n, libkg.PatrolTagInput{})
		}

		// Fields shared by every query generated from this node.
		base := TagQuery{
			GraphNodeID:     n.ID,
			ProductFitScore: n.ProductFitScore,
		}

		for _, raw := range tags.Relevance {
			tag := strings.TrimSpace(raw)
			if tag == "" {
				continue
			}
			query := BuildRelevanceQuery(provider, tag)
			if query == "" {
				continue
			}
			tq := base
			tq.Tag = tag
			tq.Query = query
			tq.Dimension = QueryRelevance
			result = append(result, tq)
		}

		for _, raw := range tags.Recency {
			tag := strings.TrimSpace(raw)
			if tag == "" {
				continue
			}

			// Narrow window first.
			ideal := BuildRecencyQuery(provider, tag, IdealMaxPostAgeDays)
			if ideal != "" {
				tq := base
				tq.Tag = tag
				tq.Query = ideal
				tq.Dimension = QueryRecency
				tq.RecencyDays = IdealMaxPostAgeDays
				result = append(result, tq)
			}

			// Wide window only when it produces a distinct query string.
			wide := BuildRecencyQuery(provider, tag, MaxPostAgeDays)
			if wide == "" || wide == ideal {
				continue
			}
			tq := base
			tq.Tag = tag
			tq.Query = wide
			tq.Dimension = QueryRecency
			tq.RecencyDays = MaxPostAgeDays
			result = append(result, tq)
		}
	}
	return result
}
// RunDualTrackDiscover runs every resolved relevance and recency query and
// merges the surviving posts into one candidate per permalink.
//
// Queries come from ResolveTagQueries. When none can be built an error is
// returned whose message depends on whether patrol keywords were supplied and
// whether any node is selected for scanning.
//
// For each query, in order:
//   - onProgress (if non-nil) is called before the query runs;
//   - the query is executed through discoverForQuery with relevanceLimitPerTag
//     or recencyLimitPerTag as the limit; any error aborts the run;
//   - posts that match an exclusion, fail PassesPlacementFilter, or have an
//     empty permalink are dropped;
//   - a first sighting creates a candidate; a repeat sighting sets the
//     relevance/recency flags, raises ProductFitScore when the new query's
//     score is higher, and fills PostedAt if it was still empty;
//   - input.OnCheckpoint (if non-nil) receives a snapshot without the final
//     filter; a non-nil error aborts the run;
//   - when the member allows the crawler and has a browser connected, a pause
//     is inserted before the next query.
//
// The returned slice is the snapshot with the final filter applied, in
// first-seen order. On any error the result is nil.
func RunDualTrackDiscover(ctx context.Context, input DualTrackInput, onProgress DualTrackProgress) ([]ScanCandidate, error) {
	queries := ResolveTagQueries(input.Nodes, input.PatrolKeywords, input.Member.WebSearchProviderEnum())
	if len(queries) == 0 {
		if len(input.PatrolKeywords) > 0 {
			return nil, fmt.Errorf("海巡關鍵字格式無效,請改用 28 字的真人搜尋短句")
		}
		for _, node := range input.Nodes {
			if node.SelectedForScan {
				return nil, fmt.Errorf("已勾選節點但沒有可用的海巡 tag請重新擴展圖譜或手動編輯 tag")
			}
		}
		return nil, fmt.Errorf("請先勾選要海巡的節點並儲存")
	}

	byPermalink := map[string]*ScanCandidate{}
	firstSeen := make([]string, 0, 64)

	// absorb folds the posts returned for one query into byPermalink,
	// recording the permalink of every new candidate in firstSeen.
	absorb := func(tq TagQuery, posts []DiscoverPost, channel DiscoverChannel) {
		fromRelevance := tq.Dimension == QueryRelevance
		fromRecency := tq.Dimension == QueryRecency

		for _, post := range posts {
			if MatchesExclusion(post.Text, input.Exclusions) || !PassesPlacementFilter(post.Text) || post.Permalink == "" {
				continue
			}

			if known, seen := byPermalink[post.Permalink]; seen {
				known.HasRelevance = known.HasRelevance || fromRelevance
				known.HasRecency = known.HasRecency || fromRecency
				if tq.ProductFitScore > known.ProductFitScore {
					known.ProductFitScore = tq.ProductFitScore
					known.SolvedByProduct = tq.ProductFitScore >= 55
				}
				if strings.TrimSpace(known.PostedAt) == "" {
					if postedAt := strings.TrimSpace(post.PostedAt); postedAt != "" {
						known.PostedAt = postedAt
					}
				}
				continue
			}

			initialPriority := "relevant"
			if fromRecency {
				initialPriority = "recent"
			}

			// Recover the external ID from the permalink when the post has none.
			externalID := post.ExternalID
			if externalID == "" {
				if parsed, parsedOK := ParseThreadsPostFromWebResult(post.Text, "", post.Permalink); parsedOK {
					externalID = parsed.ExternalID
				}
			}

			byPermalink[post.Permalink] = &ScanCandidate{
				Permalink:       post.Permalink,
				ExternalID:      externalID,
				Author:          post.Author,
				AuthorVerified:  post.AuthorVerified,
				FollowerCount:   post.FollowerCount,
				Text:            post.Text,
				SearchTag:       tq.Tag,
				QueryDimension:  tq.Dimension,
				GraphNodeID:     tq.GraphNodeID,
				ProductFitScore: tq.ProductFitScore,
				Source:          channel,
				HasRelevance:    fromRelevance,
				HasRecency:      fromRecency,
				Priority:        initialPriority,
				LikeCount:       post.LikeCount,
				ReplyCount:      post.ReplyCount,
				PlacementScore:  computePlacementScore(post.Text, tq.ProductFitScore, fromRecency),
				SolvedByProduct: tq.ProductFitScore >= 55,
				PostedAt:        strings.TrimSpace(post.PostedAt),
			}
			firstSeen = append(firstSeen, post.Permalink)
		}
	}

	total := len(queries)
	for i, tq := range queries {
		step := i + 1
		if onProgress != nil {
			onProgress(fmt.Sprintf("雙軌海巡 %d/%d%s", step, total, tq.Tag), 10+(step*75/max(total, 1)))
		}

		perQuery := relevanceLimitPerTag
		if tq.Dimension == QueryRecency {
			perQuery = recencyLimitPerTag
		}

		posts, channel, err := discoverForQuery(ctx, input, tq, perQuery)
		if err != nil {
			return nil, err
		}
		absorb(tq, posts, channel)

		if input.OnCheckpoint != nil {
			if err := input.OnCheckpoint(snapshotMergedCandidates(byPermalink, firstSeen, false)); err != nil {
				return nil, err
			}
		}

		// No pause after the last query.
		if input.Member.AllowsCrawler && input.Member.BrowserConnected && i < total-1 {
			if err := politeDiscoverPause(ctx); err != nil {
				return nil, err
			}
		}
	}

	final := snapshotMergedCandidates(byPermalink, firstSeen, true)
	if onProgress != nil {
		onProgress(fmt.Sprintf("合併完成,共 %d 篇候選貼文", len(final)), 90)
	}
	return final, nil
}
// discoverForQuery fetches posts for one tag query, preferring Discover and
// falling back to web search.
//
// If Discover succeeds with at least one post, its posts and channel are
// returned. Otherwise, when input.WebSearch is nil or not enabled, the Discover
// error is returned if there was one, else an error stating that the provider
// is not configured and no results were found. When web search is available
// its posts are returned with the member's web-search channel; if web search
// itself fails, the Discover error takes precedence over the web-search error.
func discoverForQuery(ctx context.Context, input DualTrackInput, tq TagQuery, limit int) ([]DiscoverPost, DiscoverChannel, error) {
	posts, channel, err := Discover(ctx, DiscoverRequest{
		Query:   tq.Query,
		Keyword: tq.Tag,
		Recency: tq.Dimension == QueryRecency,
		Limit:   limit,
		Member:  input.Member,
		Crawler: input.Crawler,
	})
	if err == nil && len(posts) > 0 {
		return posts, channel, nil
	}

	if input.WebSearch == nil || !input.WebSearch.Enabled() {
		if err == nil {
			err = fmt.Errorf("%s 未設定且 Threads API 無結果", input.Member.WebSearchProviderLabel())
		}
		return nil, "", err
	}

	fallback, fallbackErr := discoverViaWebSearch(ctx, input.WebSearch, input.Member, tq, limit)
	if fallbackErr == nil {
		return fallback, input.Member.WebSearchDiscoverChannel(), nil
	}
	// Both paths failed: report the primary failure when there is one.
	if err != nil {
		return nil, "", err
	}
	return nil, "", fallbackErr
}
// discoverViaWebSearch runs tq.Query through the web-search client in
// ModeThreadsDiscover and converts the hits into DiscoverPost values.
//
// A search error is returned as is. A response whose Status is not "success",
// or that has no results, yields (nil, nil). Results that
// ParseThreadsPostFromWebResult cannot parse are skipped, so the returned slice
// may be empty. Every post is tagged with the member's web-search channel.
func discoverViaWebSearch(ctx context.Context, client websearch.Client, member MemberContext, tq TagQuery, limit int) ([]DiscoverPost, error) {
	opts := websearch.SearchOptions{
		Query:              tq.Query,
		Limit:              limit,
		Mode:               websearch.ModeThreadsDiscover,
		Country:            member.BraveCountry,
		SearchLang:         member.BraveSearchLang,
		UserLocation:       member.ExaUserLocation,
		StartPublishedDate: PublishedAfterForRecency(member.WebSearchProviderEnum(), tq.RecencyDays),
	}
	res, err := client.Search(ctx, opts)
	if err != nil {
		return nil, err
	}
	if res.Status != "success" || len(res.Results) == 0 {
		return nil, nil
	}

	channel := member.WebSearchDiscoverChannel()
	posts := make([]DiscoverPost, 0, len(res.Results))
	for _, hit := range res.Results {
		if parsed, parsedOK := ParseThreadsPostFromWebResult(hit.Title, hit.Snippet, hit.URL); parsedOK {
			posts = append(posts, DiscoverPost{
				Text:       parsed.Text,
				Permalink:  parsed.Permalink,
				ExternalID: parsed.ExternalID,
				Author:     parsed.Author,
				Source:     channel,
			})
		}
	}
	return posts, nil
}
// snapshotMergedCandidates returns value copies of the merged candidates in the
// order given by order.
//
// Each candidate is passed through finalizeScanCandidate first, so the entries
// stored in merged are updated in place as a side effect. When applyFinalFilter
// is true, candidates with ProductFitScore below 30 whose Priority is not
// "gold" are left out.
//
// Keys in order that are missing from merged, or that map to a nil pointer, are
// skipped; previously such a key caused a nil-pointer dereference.
func snapshotMergedCandidates(merged map[string]*ScanCandidate, order []string, applyFinalFilter bool) []ScanCandidate {
	out := make([]ScanCandidate, 0, len(order))
	for _, key := range order {
		item, ok := merged[key]
		if !ok || item == nil {
			// Nothing to copy; a missing key yields a nil pointer.
			continue
		}
		finalizeScanCandidate(item)
		if applyFinalFilter && item.ProductFitScore < 30 && item.Priority != "gold" {
			continue
		}
		out = append(out, *item)
	}
	return out
}
// finalizeScanCandidate recomputes the derived fields of item from its current
// flags and ProductFitScore. A nil item is ignored.
//
// Priority becomes "gold" when the candidate was seen by both a relevance and a
// recency query and its ProductFitScore is at least 45, "recent" when it was
// seen by a recency query, and "relevant" otherwise. SolvedByProduct is set to
// ProductFitScore >= 55 and PlacementScore is refreshed via
// computePlacementScore.
func finalizeScanCandidate(item *ScanCandidate) {
	if item == nil {
		return
	}

	switch {
	case item.HasRelevance && item.HasRecency && item.ProductFitScore >= 45:
		item.Priority = "gold"
	case item.HasRecency:
		item.Priority = "recent"
	default:
		item.Priority = "relevant"
	}

	item.SolvedByProduct = item.ProductFitScore >= 55
	item.PlacementScore = computePlacementScore(item.Text, item.ProductFitScore, item.HasRecency)
}
// computePlacementScore returns a placement score capped at 100.
//
// The score starts at 30 plus a quarter of productFit (integer division) and
// gains 20 when HasPlacementIntent(text) is true, 12 when
// LooksLikeRecommendationPost(text) is true, 15 when recent is true, and 8 when
// productFit is at least 60. There is no lower bound.
func computePlacementScore(text string, productFit int, recent bool) int {
	const ceiling = 100

	total := productFit/4 + 30
	if HasPlacementIntent(text) {
		total += 20
	}
	if LooksLikeRecommendationPost(text) {
		total += 12
	}
	if recent {
		total += 15
	}
	if productFit >= 60 {
		total += 8
	}

	if total <= ceiling {
		return total
	}
	return ceiling
}
// max returns the larger of a and b. Within this package it takes the place of
// the predeclared max function available since Go 1.21.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// politeDiscoverPause waits for two seconds plus jitterDuration(2s) before
// returning nil. If ctx is done before the wait elapses it returns ctx.Err()
// instead. The timer is stopped on return.
func politeDiscoverPause(ctx context.Context) error {
	const base = 2 * time.Second

	pause := time.NewTimer(base + jitterDuration(base))
	defer pause.Stop()

	select {
	case <-pause.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}