2025-05-19 01:49:56 +04:00

131 lines
3.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package generator
import (
"context"
"fmt"
"regexp"
"slices"
"github.com/aykhans/bsky-feedgen/pkg/storage/mongodb/collections"
"github.com/aykhans/bsky-feedgen/pkg/types"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
// azValidUsers holds the DIDs of accounts whose posts IsValid accepts
// regardless of language or text (the reply check still applies).
var azValidUsers = []string{
	"did:plc:jbt4qi6psd7rutwzedtecsq7",
	"did:plc:yzgdpxsklrmfgqmjghdvw3ti",
	"did:plc:cs2cbzojm6hmx5lfxiuft3mq",
}
// FeedGeneratorAz selects posts for the "Az" feed: it reads posts from one
// collection, filters them with IsValid and stores the accepted ones in
// another.
type FeedGeneratorAz struct {
	// postCollection is the collection Start reads posts from.
	postCollection *collections.PostCollection
	// feedAzCollection is the collection Start inserts accepted posts into.
	feedAzCollection *collections.FeedAzCollection
	// textRegex is matched against post text by IsValid.
	textRegex *regexp.Regexp
}
// NewFeedGeneratorAz returns a FeedGeneratorAz bound to the given post and
// feed collections, with the case-insensitive keyword pattern that IsValid
// matches against post text already compiled.
func NewFeedGeneratorAz(
	postCollection *collections.PostCollection,
	feedAzCollection *collections.FeedAzCollection,
) *FeedGeneratorAz {
	// The Russian spelling must begin with Cyrillic "а" (U+0430). It is
	// written as an escape because it is visually identical to Latin "a";
	// with a Latin first letter the alternative would be a mixed-script word
	// that never matches genuine Cyrillic text.
	textRegex := regexp.MustCompile("(?i)(azerbaijan|azərbaycan|\u0430зербайджан|azerbaycan)")

	return &FeedGeneratorAz{
		postCollection:   postCollection,
		feedAzCollection: feedAzCollection,
		textRegex:        textRegex,
	}
}
// Start scans posts in ascending "sequence" order, keeps those accepted by
// IsValid and inserts them into the FeedAz collection in batches.
//
// cursorOption selects where the scan begins:
//   - types.GeneratorCursorLastGenerated resumes after the highest sequence
//     returned by feedAzCollection.GetMaxSequence, or scans every post when
//     that call reports no sequence (nil);
//   - types.GeneratorCursorFirstPost scans every post.
//
// Any other cursorOption is rejected with an error. batchSize is the number
// of accepted posts per insert and must be positive. Start returns the first
// error encountered while querying, decoding, iterating or inserting, and nil
// once the cursor is exhausted and the final partial batch has been stored.
func (generator *FeedGeneratorAz) Start(ctx context.Context, cursorOption types.GeneratorCursor, batchSize int) error {
	// A zero batch size would make the flush check divide by zero; reject
	// non-positive sizes up front instead of panicking mid-scan.
	if batchSize <= 0 {
		return fmt.Errorf("invalid batch size %d: must be positive", batchSize)
	}

	// An empty filter selects every post; the resume case narrows it.
	filter := bson.M{}
	switch cursorOption {
	case types.GeneratorCursorLastGenerated:
		sequenceCursor, err := generator.feedAzCollection.GetMaxSequence(ctx)
		if err != nil {
			return err
		}
		if sequenceCursor != nil {
			filter["sequence"] = bson.M{"$gt": *sequenceCursor}
		}
	case types.GeneratorCursorFirstPost:
		// Keep the empty filter: start from the first post.
	default:
		// Without this, an unknown option left the cursor nil and the
		// method panicked on first use.
		return fmt.Errorf("unsupported generator cursor option: %v", cursorOption)
	}

	// Always sort by sequence. Resuming relies on the maximum stored sequence,
	// so processing posts out of order could cause a later resume to skip
	// posts that were never examined.
	var mongoCursor *mongo.Cursor
	var err error
	mongoCursor, err = generator.postCollection.Collection.Find(
		ctx,
		filter,
		options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
	)
	if err != nil {
		return err
	}
	// Best-effort cleanup: a close failure cannot be acted on here and must
	// not mask the function's real result.
	defer func() { _ = mongoCursor.Close(ctx) }()

	var feedAzBatch []*collections.FeedAz
	for mongoCursor.Next(ctx) {
		var doc *collections.Post
		if err := mongoCursor.Decode(&doc); err != nil {
			return fmt.Errorf("mongodb cursor decode error: %w", err)
		}

		if !generator.IsValid(doc) {
			continue
		}

		feedAzBatch = append(
			feedAzBatch,
			&collections.FeedAz{
				ID:        doc.ID,
				Sequence:  doc.Sequence,
				DID:       doc.DID,
				RecordKey: doc.RecordKey,
				CreatedAt: doc.CreatedAt,
			},
		)

		if len(feedAzBatch) >= batchSize {
			// NOTE(review): the boolean argument's meaning is defined by
			// FeedAzCollection.Insert, which is not visible here — confirm.
			if err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...); err != nil {
				return fmt.Errorf("insert FeedAz error: %w", err)
			}
			// Start a fresh slice rather than reusing the backing array, in
			// case Insert retains the one it was given.
			feedAzBatch = nil
		}
	}

	// Next returns false both at the end of the results and on failure;
	// Err distinguishes the two so a broken scan is not reported as success.
	if err := mongoCursor.Err(); err != nil {
		return fmt.Errorf("mongodb cursor error: %w", err)
	}

	// Store whatever is left over from the last, partially filled batch.
	if len(feedAzBatch) > 0 {
		if err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...); err != nil {
			return fmt.Errorf("insert FeedAz error: %w", err)
		}
	}

	return nil
}
// IsValid reports whether post belongs in the Az feed.
//
// A reply whose root and parent URIs differ is always rejected. Otherwise the
// post is accepted when any of the following holds: its author DID is listed
// in azValidUsers, it is tagged "az" and carries fewer than three languages,
// or its text matches the generator's keyword pattern.
func (generator *FeedGeneratorAz) IsValid(post *collections.Post) bool {
	// Only top-level posts and replies made directly to the thread root pass.
	if reply := post.Reply; reply != nil && reply.RootURI != reply.ParentURI {
		return false
	}

	switch {
	case slices.Contains(azValidUsers, post.DID):
		// Author is on the always-valid list.
		return true
	case len(post.Langs) < 3 && slices.Contains(post.Langs, "az"):
		// Tagged Azerbaijani with fewer than three languages.
		return true
	default:
		// Fall back to the keyword match on the post text.
		return generator.textRegex.MatchString(post.Text)
	}
}