Az generator package to 'az' sub package

This commit is contained in:
2025-05-21 16:53:31 +04:00
parent e900cd3d47
commit 58dce559d3
3 changed files with 32 additions and 30 deletions

129
pkg/generator/az/base.go Normal file
View File

@@ -0,0 +1,129 @@
package az
import (
"context"
"fmt"
"regexp"
"slices"
"github.com/aykhans/bsky-feedgen/pkg/storage/mongodb/collections"
"github.com/aykhans/bsky-feedgen/pkg/types"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
type Generator struct {
postCollection *collections.PostCollection
feedAzCollection *collections.FeedAzCollection
textRegex *regexp.Regexp
}
func NewGenerator(
postCollection *collections.PostCollection,
feedAzCollection *collections.FeedAzCollection,
) *Generator {
return &Generator{
postCollection: postCollection,
feedAzCollection: feedAzCollection,
textRegex: regexp.MustCompile("(?i)(azerbaijan|azərbaycan|aзербайджан|azerbaycan)"),
}
}
func (generator *Generator) Start(ctx context.Context, cursorOption types.GeneratorCursor, batchSize int) error {
var mongoCursor *mongo.Cursor
switch cursorOption {
case types.GeneratorCursorLastGenerated:
sequenceCursor, err := generator.feedAzCollection.GetMaxSequence(ctx)
if err != nil {
return err
}
if sequenceCursor == nil {
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.D{},
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
)
} else {
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.M{"sequence": bson.M{"$gt": *sequenceCursor}},
)
}
if err != nil {
return err
}
case types.GeneratorCursorFirstPost:
var err error
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.D{},
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
)
if err != nil {
return err
}
}
defer func() { _ = mongoCursor.Close(ctx) }()
feedAzBatch := []*collections.FeedAz{}
for mongoCursor.Next(ctx) {
var doc *collections.Post
if err := mongoCursor.Decode(&doc); err != nil {
return fmt.Errorf("mongodb cursor decode error: %v", err)
}
if generator.IsValid(doc) == false {
continue
}
feedAzBatch = append(
feedAzBatch,
&collections.FeedAz{
ID: doc.ID,
Sequence: doc.Sequence,
DID: doc.DID,
RecordKey: doc.RecordKey,
CreatedAt: doc.CreatedAt,
},
)
if len(feedAzBatch)%batchSize == 0 {
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
if err != nil {
return fmt.Errorf("insert FeedAz error: %v", err)
}
feedAzBatch = []*collections.FeedAz{}
}
}
if len(feedAzBatch) > 0 {
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
if err != nil {
return fmt.Errorf("insert FeedAz error: %v", err)
}
}
return nil
}
func (generator *Generator) IsValid(post *collections.Post) bool {
if post.Reply != nil && post.Reply.RootURI != post.Reply.ParentURI {
return false
}
if slices.Contains(invalidUsers, post.DID) {
return false
}
if slices.Contains(validUsers, post.DID) || // Posts from always-valid users
(slices.Contains(post.Langs, "az") && len(post.Langs) < 3) || // Posts in Azerbaijani language with fewer than 3 languages
generator.textRegex.MatchString(post.Text) { // Posts containing Azerbaijan-related keywords
return true
}
return false
}