mirror of
https://github.com/aykhans/bsky-feedgen.git
synced 2025-06-01 19:07:34 +00:00
136 lines
3.6 KiB
Go
136 lines
3.6 KiB
Go
package az
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"regexp"
|
||
"slices"
|
||
|
||
"github.com/aykhans/bsky-feedgen/pkg/storage/mongodb/collections"
|
||
"github.com/aykhans/bsky-feedgen/pkg/types"
|
||
"go.mongodb.org/mongo-driver/bson"
|
||
"go.mongodb.org/mongo-driver/mongo"
|
||
"go.mongodb.org/mongo-driver/mongo/options"
|
||
)
|
||
|
||
type Generator struct {
|
||
postCollection *collections.PostCollection
|
||
feedAzCollection *collections.FeedAzCollection
|
||
textRegex *regexp.Regexp
|
||
}
|
||
|
||
func NewGenerator(
|
||
postCollection *collections.PostCollection,
|
||
feedAzCollection *collections.FeedAzCollection,
|
||
) *Generator {
|
||
return &Generator{
|
||
postCollection: postCollection,
|
||
feedAzCollection: feedAzCollection,
|
||
textRegex: regexp.MustCompile("(?i)(azerbaijan|azərbaycan|aзербайджан|azerbaycan)"),
|
||
}
|
||
}
|
||
|
||
func (generator *Generator) Start(ctx context.Context, cursorOption types.GeneratorCursor, batchSize int) error {
|
||
var mongoCursor *mongo.Cursor
|
||
switch cursorOption {
|
||
case types.GeneratorCursorLastGenerated:
|
||
sequenceCursor, err := generator.feedAzCollection.GetMaxSequence(ctx)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
if sequenceCursor == nil {
|
||
mongoCursor, err = generator.postCollection.Collection.Find(
|
||
ctx,
|
||
bson.D{},
|
||
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
|
||
)
|
||
} else {
|
||
mongoCursor, err = generator.postCollection.Collection.Find(
|
||
ctx,
|
||
bson.M{"sequence": bson.M{"$gt": *sequenceCursor}},
|
||
)
|
||
}
|
||
if err != nil {
|
||
return err
|
||
}
|
||
case types.GeneratorCursorFirstPost:
|
||
var err error
|
||
mongoCursor, err = generator.postCollection.Collection.Find(
|
||
ctx,
|
||
bson.D{},
|
||
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
|
||
)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
}
|
||
|
||
defer func() { _ = mongoCursor.Close(ctx) }()
|
||
|
||
feedAzBatch := []*collections.FeedAz{}
|
||
for mongoCursor.Next(ctx) {
|
||
var doc *collections.Post
|
||
if err := mongoCursor.Decode(&doc); err != nil {
|
||
return fmt.Errorf("mongodb cursor decode error: %v", err)
|
||
}
|
||
|
||
if generator.IsValid(doc) == false {
|
||
continue
|
||
}
|
||
|
||
feedAzBatch = append(
|
||
feedAzBatch,
|
||
&collections.FeedAz{
|
||
ID: doc.ID,
|
||
Sequence: doc.Sequence,
|
||
DID: doc.DID,
|
||
RecordKey: doc.RecordKey,
|
||
CreatedAt: doc.CreatedAt,
|
||
},
|
||
)
|
||
|
||
if len(feedAzBatch)%batchSize == 0 {
|
||
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
|
||
if err != nil {
|
||
return fmt.Errorf("insert FeedAz error: %v", err)
|
||
}
|
||
feedAzBatch = []*collections.FeedAz{}
|
||
}
|
||
}
|
||
|
||
if len(feedAzBatch) > 0 {
|
||
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
|
||
if err != nil {
|
||
return fmt.Errorf("insert FeedAz error: %v", err)
|
||
}
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
func (generator *Generator) IsValid(post *collections.Post) bool {
|
||
// Skip posts that are deep replies (not direct replies to original posts)
|
||
if post.Reply != nil && post.Reply.RootURI != post.Reply.ParentURI {
|
||
return false
|
||
}
|
||
|
||
// Check if the user who created this post is in our pre-defined list
|
||
// This allows for explicit inclusion/exclusion of specific users
|
||
if isValidUser := Users.IsValid(post.DID); isValidUser != nil {
|
||
return *isValidUser
|
||
}
|
||
|
||
// A post is considered valid if it meets either of the following criteria:
|
||
// 1. It's primarily in Azerbaijani (language code "az") with less than 3 detected languages
|
||
// (to filter out multi-language spam)
|
||
// 2. It contains Azerbaijan-related keywords in the text AND has at least one valid language
|
||
// from our approved language list
|
||
if (slices.Contains(post.Langs, "az") && len(post.Langs) < 3) ||
|
||
(generator.textRegex.MatchString(post.Text) && Langs.IsExistsAny(post.Langs)) {
|
||
return true
|
||
}
|
||
|
||
return false
|
||
}
|