From 485486c7ff986712ecb09e92217236d276d317c4 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 08:46:55 +0000 Subject: feat: deduplicate feed items by guid Use ROW_NUMBER() window function to filter out duplicate articles with the same guid across different feeds. This ensures each guid appears only once in unread/read article lists. - Add idx_articles_guid index for PARTITION BY performance - Modify GetUnreadArticles and GetReadArticles queries with CTE --- backend/db/articles.sql.go | 40 ++++++++++++++++++---------- backend/db/migrations/005_add_guid_index.sql | 2 ++ backend/db/queries/articles.sql | 40 ++++++++++++++++++---------- backend/db/schema.sql | 2 ++ 4 files changed, 56 insertions(+), 28 deletions(-) create mode 100644 backend/db/migrations/005_add_guid_index.sql diff --git a/backend/db/articles.sql.go b/backend/db/articles.sql.go index bf6ea1d..2f6f80d 100644 --- a/backend/db/articles.sql.go +++ b/backend/db/articles.sql.go @@ -179,13 +179,19 @@ func (q *Queries) GetArticlesByFeed(ctx context.Context, feedID int64) ([]Articl } const getReadArticles = `-- name: GetReadArticles :many -SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed -FROM articles AS a -INNER JOIN feeds AS f ON a.feed_id = f.id -WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? -ORDER BY a.id DESC +WITH ranked AS ( + SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, + ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn + FROM articles AS a + INNER JOIN feeds AS f ON a.feed_id = f.id + WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? +) +SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed +FROM ranked +WHERE rn = 1 +ORDER BY id DESC LIMIT 100 ` @@ -237,13 +243,19 @@ func (q *Queries) GetReadArticles(ctx context.Context, userID int64) ([]GetReadA } const getUnreadArticles = `-- name: GetUnreadArticles :many -SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed -FROM articles AS a -INNER JOIN feeds AS f ON a.feed_id = f.id -WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? -ORDER BY a.id DESC +WITH ranked AS ( + SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, + ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn + FROM articles AS a + INNER JOIN feeds AS f ON a.feed_id = f.id + WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? +) +SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed +FROM ranked +WHERE rn = 1 +ORDER BY id DESC LIMIT 100 ` diff --git a/backend/db/migrations/005_add_guid_index.sql b/backend/db/migrations/005_add_guid_index.sql new file mode 100644 index 0000000..a653d79 --- /dev/null +++ b/backend/db/migrations/005_add_guid_index.sql @@ -0,0 +1,2 @@ +-- Add index on guid for deduplication queries +CREATE INDEX IF NOT EXISTS idx_articles_guid ON articles(guid); diff --git a/backend/db/queries/articles.sql b/backend/db/queries/articles.sql index 5acdada..bd4cd91 100644 --- a/backend/db/queries/articles.sql +++ b/backend/db/queries/articles.sql @@ -7,23 +7,35 @@ INNER JOIN feeds AS f ON a.feed_id = f.id WHERE a.id = ?; -- name: GetUnreadArticles :many -SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed -FROM articles AS a -INNER JOIN feeds AS f ON a.feed_id = f.id -WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? -ORDER BY a.id DESC +WITH ranked AS ( + SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, + ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn + FROM articles AS a + INNER JOIN feeds AS f ON a.feed_id = f.id + WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? +) +SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed +FROM ranked +WHERE rn = 1 +ORDER BY id DESC LIMIT 100; -- name: GetReadArticles :many -SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed -FROM articles AS a -INNER JOIN feeds AS f ON a.feed_id = f.id -WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? -ORDER BY a.id DESC +WITH ranked AS ( + SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, + ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn + FROM articles AS a + INNER JOIN feeds AS f ON a.feed_id = f.id + WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? +) +SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed +FROM ranked +WHERE rn = 1 +ORDER BY id DESC LIMIT 100; -- name: GetArticlesByFeed :many diff --git a/backend/db/schema.sql b/backend/db/schema.sql index adf3b37..07ac72d 100644 --- a/backend/db/schema.sql +++ b/backend/db/schema.sql @@ -34,4 +34,6 @@ CREATE INDEX IF NOT EXISTS idx_articles_feed_guid ON articles(feed_id, guid); CREATE INDEX IF NOT EXISTS idx_articles_is_read ON articles(is_read); +CREATE INDEX IF NOT EXISTS idx_articles_guid ON articles(guid); + CREATE INDEX IF NOT EXISTS idx_feeds_user_id ON feeds(user_id); -- cgit v1.2.3-70-g09d2 From 8524487824f7332223b24e75ab327bf6ec5eccc9 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 08:58:04 +0000 Subject: refactor: deduplicate articles at insertion time instead of query time Change deduplication strategy from query-time (ROW_NUMBER window function) to insertion-time (global guid check before insert). Benefits: - Simpler queries without CTE/window functions - Consistent read state (no duplicate articles to manage) - Better query performance (no per-query deduplication overhead) Changes: - Add CheckArticleExistsByGUID query for global guid lookup - Add migration to remove existing duplicate articles - Modify fetchOneFeed and AddFeed to skip duplicates on insert - Revert GetUnreadArticles/GetReadArticles to simple queries --- backend/cmd/serve.go | 20 +++++-- backend/db/articles.sql.go | 82 +++++++++++++++++++--------- backend/db/migrations/005_add_guid_index.sql | 10 +++- backend/db/queries/articles.sql | 50 ++++++++--------- backend/graphql/resolver/schema.resolvers.go | 12 +++- 5 files changed, 116 insertions(+), 58 deletions(-) diff --git a/backend/cmd/serve.go b/backend/cmd/serve.go index 28b1282..30d0702 100644 --- a/backend/cmd/serve.go +++ b/backend/cmd/serve.go @@ -47,16 +47,18 @@ func fetchOneFeed(feedID int64, url string, ctx context.Context, queries *db.Que if err != nil { return err } + // Get GUIDs for this feed (for updating existing articles) guids, err := queries.GetArticleGUIDsByFeed(ctx, feedID) if err != nil { return err } - existingArticleGUIDs := make(map[string]bool) + existingFeedGUIDs := make(map[string]bool) for _, guid := range guids { - existingArticleGUIDs[guid] = true + existingFeedGUIDs[guid] = true } for _, item := range feed.Items { - if existingArticleGUIDs[item.GUID] { + if existingFeedGUIDs[item.GUID] { + // Article exists in this feed, update it err := queries.UpdateArticle(ctx, db.UpdateArticleParams{ Title: item.Title, Url: item.Link, @@ -67,7 +69,17 @@ func fetchOneFeed(feedID int64, url string, ctx context.Context, queries *db.Que return err } } else { - _, err := queries.CreateArticle(ctx, db.CreateArticleParams{ + // Check if article with same GUID exists globally (in any feed) + exists, err := queries.CheckArticleExistsByGUID(ctx, item.GUID) + if err != nil { + return err + } + if exists == 1 { + // Article already exists in another feed, skip + continue + } + // Create new article + _, err = queries.CreateArticle(ctx, db.CreateArticleParams{ FeedID: feedID, Guid: item.GUID, Title: item.Title, diff --git a/backend/db/articles.sql.go b/backend/db/articles.sql.go index 2f6f80d..41c3112 100644 --- a/backend/db/articles.sql.go +++ b/backend/db/articles.sql.go @@ -28,6 +28,48 @@ func (q *Queries) CheckArticleExists(ctx context.Context, arg CheckArticleExists return article_exists, err } +const checkArticleExistsByGUID = `-- name: CheckArticleExistsByGUID :one +SELECT EXISTS( + SELECT 1 FROM articles + WHERE guid = ? +) as article_exists +` + +func (q *Queries) CheckArticleExistsByGUID(ctx context.Context, guid string) (int64, error) { + row := q.db.QueryRowContext(ctx, checkArticleExistsByGUID, guid) + var article_exists int64 + err := row.Scan(&article_exists) + return article_exists, err +} + +const getAllArticleGUIDs = `-- name: GetAllArticleGUIDs :many +SELECT DISTINCT guid +FROM articles +` + +func (q *Queries) GetAllArticleGUIDs(ctx context.Context) ([]string, error) { + rows, err := q.db.QueryContext(ctx, getAllArticleGUIDs) + if err != nil { + return nil, err + } + defer rows.Close() + items := []string{} + for rows.Next() { + var guid string + if err := rows.Scan(&guid); err != nil { + return nil, err + } + items = append(items, guid) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const createArticle = `-- name: CreateArticle :one INSERT INTO articles (feed_id, guid, title, url, is_read) VALUES (?, ?, ?, ?, ?) @@ -179,19 +221,13 @@ func (q *Queries) GetArticlesByFeed(ctx context.Context, feedID int64) ([]Articl } const getReadArticles = `-- name: GetReadArticles :many -WITH ranked AS ( - SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, - ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn - FROM articles AS a - INNER JOIN feeds AS f ON a.feed_id = f.id - WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? -) -SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed -FROM ranked -WHERE rn = 1 -ORDER BY id DESC +SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed +FROM articles AS a +INNER JOIN feeds AS f ON a.feed_id = f.id +WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? +ORDER BY a.id DESC LIMIT 100 ` @@ -243,19 +279,13 @@ func (q *Queries) GetReadArticles(ctx context.Context, userID int64) ([]GetReadA } const getUnreadArticles = `-- name: GetUnreadArticles :many -WITH ranked AS ( - SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, - ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn - FROM articles AS a - INNER JOIN feeds AS f ON a.feed_id = f.id - WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? -) -SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed -FROM ranked -WHERE rn = 1 -ORDER BY id DESC +SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed +FROM articles AS a +INNER JOIN feeds AS f ON a.feed_id = f.id +WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? +ORDER BY a.id DESC LIMIT 100 ` diff --git a/backend/db/migrations/005_add_guid_index.sql b/backend/db/migrations/005_add_guid_index.sql index a653d79..e3625ee 100644 --- a/backend/db/migrations/005_add_guid_index.sql +++ b/backend/db/migrations/005_add_guid_index.sql @@ -1,2 +1,10 @@ --- Add index on guid for deduplication queries +-- Add index on guid for deduplication CREATE INDEX IF NOT EXISTS idx_articles_guid ON articles(guid); + +-- Remove duplicate articles by guid, keeping only the one with the smallest id +DELETE FROM articles +WHERE id NOT IN ( + SELECT MIN(id) + FROM articles + GROUP BY guid +); diff --git a/backend/db/queries/articles.sql b/backend/db/queries/articles.sql index bd4cd91..134b967 100644 --- a/backend/db/queries/articles.sql +++ b/backend/db/queries/articles.sql @@ -7,35 +7,23 @@ INNER JOIN feeds AS f ON a.feed_id = f.id WHERE a.id = ?; -- name: GetUnreadArticles :many -WITH ranked AS ( - SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, - ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn - FROM articles AS a - INNER JOIN feeds AS f ON a.feed_id = f.id - WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? -) -SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed -FROM ranked -WHERE rn = 1 -ORDER BY id DESC +SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed +FROM articles AS a +INNER JOIN feeds AS f ON a.feed_id = f.id +WHERE a.is_read = 0 AND f.is_subscribed = 1 AND f.user_id = ? +ORDER BY a.id DESC LIMIT 100; -- name: GetReadArticles :many -WITH ranked AS ( - SELECT - a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, - f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed, - ROW_NUMBER() OVER (PARTITION BY a.guid ORDER BY a.id) as rn - FROM articles AS a - INNER JOIN feeds AS f ON a.feed_id = f.id - WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? -) -SELECT id, feed_id, guid, title, url, is_read, feed_id_2, feed_url, feed_title, feed_is_subscribed -FROM ranked -WHERE rn = 1 -ORDER BY id DESC +SELECT + a.id, a.feed_id, a.guid, a.title, a.url, a.is_read, + f.id as feed_id_2, f.url as feed_url, f.title as feed_title, f.is_subscribed as feed_is_subscribed +FROM articles AS a +INNER JOIN feeds AS f ON a.feed_id = f.id +WHERE a.is_read = 1 AND f.is_subscribed = 1 AND f.user_id = ? +ORDER BY a.id DESC LIMIT 100; -- name: GetArticlesByFeed :many @@ -83,3 +71,13 @@ SELECT EXISTS( SELECT 1 FROM articles WHERE feed_id = ? AND guid = ? ) as article_exists; + +-- name: CheckArticleExistsByGUID :one +SELECT EXISTS( + SELECT 1 FROM articles + WHERE guid = ? +) as article_exists; + +-- name: GetAllArticleGUIDs :many +SELECT DISTINCT guid +FROM articles; diff --git a/backend/graphql/resolver/schema.resolvers.go b/backend/graphql/resolver/schema.resolvers.go index 46c39e7..c3f6f0a 100644 --- a/backend/graphql/resolver/schema.resolvers.go +++ b/backend/graphql/resolver/schema.resolvers.go @@ -43,8 +43,18 @@ func (r *mutationResolver) AddFeed(ctx context.Context, url string) (*model.Feed return nil, fmt.Errorf("failed to insert feed: %w", err) } - // Insert articles from the feed + // Insert articles from the feed (skip duplicates by guid) for _, item := range feed.Items { + // Check if article with same GUID already exists globally + exists, err := r.Queries.CheckArticleExistsByGUID(ctx, item.GUID) + if err != nil { + fmt.Printf("Failed to check article existence: %v\n", err) + continue + } + if exists == 1 { + // Article already exists, skip + continue + } _, err = r.Queries.CreateArticle(ctx, db.CreateArticleParams{ FeedID: dbFeed.ID, Guid: item.GUID, -- cgit v1.2.3-70-g09d2 From 86254f8eb7f812915464c8733f7017a6e0a3dd4d Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 09:07:43 +0000 Subject: refactor: remove unused GetAllArticleGUIDs query --- backend/db/articles.sql.go | 28 ---------------------------- backend/db/queries/articles.sql | 4 ---- 2 files changed, 32 deletions(-) diff --git a/backend/db/articles.sql.go b/backend/db/articles.sql.go index 41c3112..7f6400b 100644 --- a/backend/db/articles.sql.go +++ b/backend/db/articles.sql.go @@ -42,34 +42,6 @@ func (q *Queries) CheckArticleExistsByGUID(ctx context.Context, guid string) (in return article_exists, err } -const getAllArticleGUIDs = `-- name: GetAllArticleGUIDs :many -SELECT DISTINCT guid -FROM articles -` - -func (q *Queries) GetAllArticleGUIDs(ctx context.Context) ([]string, error) { - rows, err := q.db.QueryContext(ctx, getAllArticleGUIDs) - if err != nil { - return nil, err - } - defer rows.Close() - items := []string{} - for rows.Next() { - var guid string - if err := rows.Scan(&guid); err != nil { - return nil, err - } - items = append(items, guid) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - const createArticle = `-- name: CreateArticle :one INSERT INTO articles (feed_id, guid, title, url, is_read) VALUES (?, ?, ?, ?, ?) diff --git a/backend/db/queries/articles.sql b/backend/db/queries/articles.sql index 134b967..2c00678 100644 --- a/backend/db/queries/articles.sql +++ b/backend/db/queries/articles.sql @@ -77,7 +77,3 @@ SELECT EXISTS( SELECT 1 FROM articles WHERE guid = ? ) as article_exists; - --- name: GetAllArticleGUIDs :many -SELECT DISTINCT guid -FROM articles; -- cgit v1.2.3-70-g09d2