136 lines
3.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package az
import (
"context"
"fmt"
"regexp"
"slices"
"github.com/aykhans/bsky-feedgen/pkg/storage/mongodb/collections"
"github.com/aykhans/bsky-feedgen/pkg/types"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
type Generator struct {
postCollection *collections.PostCollection
feedAzCollection *collections.FeedAzCollection
textRegex *regexp.Regexp
}
func NewGenerator(
postCollection *collections.PostCollection,
feedAzCollection *collections.FeedAzCollection,
) *Generator {
return &Generator{
postCollection: postCollection,
feedAzCollection: feedAzCollection,
textRegex: regexp.MustCompile("(?i)(azerbaijan|azərbaycan|aзербайджан|azerbaycan)"),
}
}
func (generator *Generator) Start(ctx context.Context, cursorOption types.GeneratorCursor, batchSize int) error {
var mongoCursor *mongo.Cursor
switch cursorOption {
case types.GeneratorCursorLastGenerated:
sequenceCursor, err := generator.feedAzCollection.GetMaxSequence(ctx)
if err != nil {
return err
}
if sequenceCursor == nil {
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.D{},
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
)
} else {
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.M{"sequence": bson.M{"$gt": *sequenceCursor}},
)
}
if err != nil {
return err
}
case types.GeneratorCursorFirstPost:
var err error
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.D{},
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
)
if err != nil {
return err
}
}
defer func() { _ = mongoCursor.Close(ctx) }()
feedAzBatch := []*collections.FeedAz{}
for mongoCursor.Next(ctx) {
var doc *collections.Post
if err := mongoCursor.Decode(&doc); err != nil {
return fmt.Errorf("mongodb cursor decode error: %v", err)
}
if generator.IsValid(doc) == false {
continue
}
feedAzBatch = append(
feedAzBatch,
&collections.FeedAz{
ID: doc.ID,
Sequence: doc.Sequence,
DID: doc.DID,
RecordKey: doc.RecordKey,
CreatedAt: doc.CreatedAt,
},
)
if len(feedAzBatch)%batchSize == 0 {
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
if err != nil {
return fmt.Errorf("insert FeedAz error: %v", err)
}
feedAzBatch = []*collections.FeedAz{}
}
}
if len(feedAzBatch) > 0 {
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
if err != nil {
return fmt.Errorf("insert FeedAz error: %v", err)
}
}
return nil
}
func (generator *Generator) IsValid(post *collections.Post) bool {
// Skip posts that are deep replies (not direct replies to original posts)
if post.Reply != nil && post.Reply.RootURI != post.Reply.ParentURI {
return false
}
// Check if the user who created this post is in our pre-defined list
// This allows for explicit inclusion/exclusion of specific users
if isValidUser := Users.IsValid(post.DID); isValidUser != nil {
return *isValidUser
}
// A post is considered valid if it meets either of the following criteria:
// 1. It's primarily in Azerbaijani (language code "az") with less than 3 detected languages
// (to filter out multi-language spam)
// 2. It contains Azerbaijan-related keywords in the text AND has at least one valid language
// from our approved language list
if (slices.Contains(post.Langs, "az") && len(post.Langs) < 3) ||
(generator.textRegex.MatchString(post.Text) && Langs.IsExistsAny(post.Langs)) {
return true
}
return false
}