This commit is contained in:
2025-05-19 01:49:56 +04:00
commit e6fec752f9
52 changed files with 4337 additions and 0 deletions

130
pkg/generator/az.go Normal file
View File

@@ -0,0 +1,130 @@
package generator
import (
"context"
"fmt"
"regexp"
"slices"
"github.com/aykhans/bsky-feedgen/pkg/storage/mongodb/collections"
"github.com/aykhans/bsky-feedgen/pkg/types"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
var azValidUsers []string = []string{
"did:plc:jbt4qi6psd7rutwzedtecsq7",
"did:plc:yzgdpxsklrmfgqmjghdvw3ti",
"did:plc:cs2cbzojm6hmx5lfxiuft3mq",
}
type FeedGeneratorAz struct {
postCollection *collections.PostCollection
feedAzCollection *collections.FeedAzCollection
textRegex *regexp.Regexp
}
func NewFeedGeneratorAz(
postCollection *collections.PostCollection,
feedAzCollection *collections.FeedAzCollection,
) *FeedGeneratorAz {
return &FeedGeneratorAz{
postCollection: postCollection,
feedAzCollection: feedAzCollection,
textRegex: regexp.MustCompile("(?i)(azerbaijan|azərbaycan|aзербайджан|azerbaycan)"),
}
}
func (generator *FeedGeneratorAz) Start(ctx context.Context, cursorOption types.GeneratorCursor, batchSize int) error {
var mongoCursor *mongo.Cursor
switch cursorOption {
case types.GeneratorCursorLastGenerated:
sequenceCursor, err := generator.feedAzCollection.GetMaxSequence(ctx)
if err != nil {
return err
}
if sequenceCursor == nil {
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.D{},
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
)
} else {
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.M{"sequence": bson.M{"$gt": *sequenceCursor}},
)
}
if err != nil {
return err
}
case types.GeneratorCursorFirstPost:
var err error
mongoCursor, err = generator.postCollection.Collection.Find(
ctx,
bson.D{},
options.Find().SetSort(bson.D{{Key: "sequence", Value: 1}}),
)
if err != nil {
return err
}
}
defer func() { _ = mongoCursor.Close(ctx) }()
feedAzBatch := []*collections.FeedAz{}
for mongoCursor.Next(ctx) {
var doc *collections.Post
if err := mongoCursor.Decode(&doc); err != nil {
return fmt.Errorf("mongodb cursor decode error: %v", err)
}
if generator.IsValid(doc) == false {
continue
}
feedAzBatch = append(
feedAzBatch,
&collections.FeedAz{
ID: doc.ID,
Sequence: doc.Sequence,
DID: doc.DID,
RecordKey: doc.RecordKey,
CreatedAt: doc.CreatedAt,
},
)
if len(feedAzBatch)%batchSize == 0 {
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
if err != nil {
return fmt.Errorf("insert FeedAz error: %v", err)
}
feedAzBatch = []*collections.FeedAz{}
}
}
if len(feedAzBatch) > 0 {
err := generator.feedAzCollection.Insert(ctx, true, feedAzBatch...)
if err != nil {
return fmt.Errorf("insert FeedAz error: %v", err)
}
}
return nil
}
func (generator *FeedGeneratorAz) IsValid(post *collections.Post) bool {
if post.Reply != nil && post.Reply.RootURI != post.Reply.ParentURI {
return false
}
if slices.Contains(azValidUsers, post.DID) || // Posts from always-valid users
(slices.Contains(post.Langs, "az") && len(post.Langs) < 3) || // Posts in Azerbaijani language with fewer than 3 languages
generator.textRegex.MatchString(post.Text) { // Posts containing Azerbaijan-related keywords
return true
}
return false
}