aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authornsfisis <nsfisis@gmail.com>2026-02-14 12:20:31 +0900
committernsfisis <nsfisis@gmail.com>2026-02-14 12:20:31 +0900
commit042fcb5c4eac16f18fc051f55a6c63ca9e97306b (patch)
tree1a61d1f7690e933a8d1e452e744ac02db14af042
parentfffd36268a216044523c3f5227c3d375608c36dc (diff)
downloadfeedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.gz
feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.zst
feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.zip
feat(feed): auto-discover feed URLs from HTML pages
When an HTML page is provided instead of a direct feed URL, parse
<link rel="alternate"> tags to find RSS/Atom feeds. Atom is preferred
over RSS when both are present.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
-rw-r--r--backend/api/handler_feeds.go8
-rw-r--r--backend/cmd/serve.go4
-rw-r--r--backend/feed/discover.go94
-rw-r--r--backend/feed/discover_test.go129
-rw-r--r--backend/feed/feed.go63
-rw-r--r--backend/go.mod2
6 files changed, 289 insertions, 11 deletions
diff --git a/backend/api/handler_feeds.go b/backend/api/handler_feeds.go
index f0a8785..4d16e4b 100644
--- a/backend/api/handler_feeds.go
+++ b/backend/api/handler_feeds.go
@@ -46,14 +46,14 @@ func (h *Handler) FeedsAddFeed(ctx context.Context, request FeedsAddFeedRequestO
return nil, fmt.Errorf("authentication required")
}
- f, err := feed.Fetch(ctx, request.Body.Url)
+ result, err := feed.Fetch(ctx, request.Body.Url)
if err != nil {
return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to parse feed: %v", err)}, nil
}
dbFeed, err := h.Queries.CreateFeed(ctx, db.CreateFeedParams{
- Url: request.Body.Url,
- Title: f.Title,
+ Url: result.URL,
+ Title: result.Feed.Title,
FetchedAt: time.Now().UTC().Format(time.RFC3339),
UserID: userID,
})
@@ -61,7 +61,7 @@ func (h *Handler) FeedsAddFeed(ctx context.Context, request FeedsAddFeedRequestO
return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to insert feed: %v", err)}, nil
}
- if err := feed.Sync(ctx, h.Queries, dbFeed.ID, f); err != nil {
+ if err := feed.Sync(ctx, h.Queries, dbFeed.ID, result.Feed); err != nil {
return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to sync articles: %v", err)}, nil
}
diff --git a/backend/cmd/serve.go b/backend/cmd/serve.go
index c65a19f..4b32868 100644
--- a/backend/cmd/serve.go
+++ b/backend/cmd/serve.go
@@ -25,11 +25,11 @@ import (
// fetchOneFeed downloads and parses the feed at url, then syncs its
// articles into the database for the feed row identified by feedID.
// NOTE(review): ctx is conventionally the first parameter; left as-is
// because reordering would break existing callers.
func fetchOneFeed(feedID int64, url string, ctx context.Context, queries *db.Queries) error {
	log.Printf("Fetching %s...\n", url)
	// Fetch may transparently auto-discover the real feed URL when the
	// stored URL points at an HTML page; only the parsed feed is needed here.
	result, err := feed.Fetch(ctx, url)
	if err != nil {
		return err
	}
	return feed.Sync(ctx, queries, feedID, result.Feed)
}
func listFeedsToBeFetched(ctx context.Context, queries *db.Queries) (map[int64]string, error) {
diff --git a/backend/feed/discover.go b/backend/feed/discover.go
new file mode 100644
index 0000000..962cbcd
--- /dev/null
+++ b/backend/feed/discover.go
@@ -0,0 +1,94 @@
+package feed
+
+import (
+ "io"
+ "net/url"
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
// feedLink is a single feed reference discovered in an HTML document's
// <link rel="alternate"> tags.
type feedLink struct {
	URL  string // absolute feed URL, already resolved against the page URL
	Type string // "atom" or "rss"
}
+
+// discoverFeeds parses an HTML document and extracts feed URLs from
+// <link rel="alternate"> tags. It resolves relative URLs against baseURL.
+func discoverFeeds(body io.Reader, baseURL *url.URL) []feedLink {
+ var links []feedLink
+ z := html.NewTokenizer(body)
+ for {
+ tt := z.Next()
+ switch tt {
+ case html.ErrorToken:
+ return links
+ case html.StartTagToken, html.SelfClosingTagToken:
+ tn, _ := z.TagName()
+ tagName := string(tn)
+
+ if tagName == "body" {
+ return links
+ }
+ if tagName != "link" {
+ continue
+ }
+
+ attrs := tokenAttrs(z)
+ rel := strings.ToLower(attrs["rel"])
+ typ := strings.ToLower(attrs["type"])
+ href := attrs["href"]
+
+ if rel != "alternate" || href == "" {
+ continue
+ }
+
+ var feedType string
+ switch typ {
+ case "application/atom+xml":
+ feedType = "atom"
+ case "application/rss+xml":
+ feedType = "rss"
+ default:
+ continue
+ }
+
+ ref, err := url.Parse(href)
+ if err != nil {
+ continue
+ }
+ resolved := baseURL.ResolveReference(ref).String()
+ links = append(links, feedLink{URL: resolved, Type: feedType})
+ }
+ }
+}
+
+// selectFeed picks the best feed URL from discovered links.
+// Prefers Atom over RSS.
+func selectFeed(links []feedLink) string {
+ for _, l := range links {
+ if l.Type == "atom" {
+ return l.URL
+ }
+ }
+ for _, l := range links {
+ if l.Type == "rss" {
+ return l.URL
+ }
+ }
+ return ""
+}
+
+func tokenAttrs(z *html.Tokenizer) map[string]string {
+ attrs := make(map[string]string)
+ for {
+ key, val, more := z.TagAttr()
+ if len(key) > 0 {
+ attrs[string(key)] = string(val)
+ }
+ if !more {
+ break
+ }
+ }
+ return attrs
+}
diff --git a/backend/feed/discover_test.go b/backend/feed/discover_test.go
new file mode 100644
index 0000000..4a1315e
--- /dev/null
+++ b/backend/feed/discover_test.go
@@ -0,0 +1,129 @@
+package feed
+
+import (
+ "net/url"
+ "strings"
+ "testing"
+)
+
// mustParseURL parses s and panics on failure; test-fixture helper only.
func mustParseURL(s string) *url.URL {
	parsed, err := url.Parse(s)
	if err != nil {
		panic(err)
	}
	return parsed
}
+
// TestDiscoverFeeds_AtomAndRSS verifies that both Atom and RSS alternate
// links are discovered, in document order, with relative hrefs resolved
// against the page's base URL.
func TestDiscoverFeeds_AtomAndRSS(t *testing.T) {
	html := `<!DOCTYPE html>
<html>
<head>
  <link rel="alternate" type="application/atom+xml" href="/feed.atom" title="Atom Feed">
  <link rel="alternate" type="application/rss+xml" href="/feed.rss" title="RSS Feed">
</head>
<body></body>
</html>`

	base := mustParseURL("https://example.com/blog")
	links := discoverFeeds(strings.NewReader(html), base)

	if len(links) != 2 {
		t.Fatalf("expected 2 links, got %d", len(links))
	}
	// "/feed.atom" resolves against the host root, not against "/blog".
	if links[0].URL != "https://example.com/feed.atom" || links[0].Type != "atom" {
		t.Errorf("unexpected first link: %+v", links[0])
	}
	if links[1].URL != "https://example.com/feed.rss" || links[1].Type != "rss" {
		t.Errorf("unexpected second link: %+v", links[1])
	}
}
+
+func TestDiscoverFeeds_AbsoluteURL(t *testing.T) {
+ html := `<html><head>
+ <link rel="alternate" type="application/atom+xml" href="https://other.com/feed.xml">
+</head><body></body></html>`
+
+ base := mustParseURL("https://example.com")
+ links := discoverFeeds(strings.NewReader(html), base)
+
+ if len(links) != 1 {
+ t.Fatalf("expected 1 link, got %d", len(links))
+ }
+ if links[0].URL != "https://other.com/feed.xml" {
+ t.Errorf("expected absolute URL preserved, got %s", links[0].URL)
+ }
+}
+
+func TestDiscoverFeeds_NoFeeds(t *testing.T) {
+ html := `<html><head><title>No feeds</title></head><body></body></html>`
+ links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
+ if len(links) != 0 {
+ t.Fatalf("expected 0 links, got %d", len(links))
+ }
+}
+
+func TestDiscoverFeeds_IgnoresNonAlternate(t *testing.T) {
+ html := `<html><head>
+ <link rel="stylesheet" type="text/css" href="/style.css">
+ <link rel="alternate" type="application/atom+xml" href="/feed.atom">
+</head><body></body></html>`
+
+ links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
+ if len(links) != 1 {
+ t.Fatalf("expected 1 link, got %d", len(links))
+ }
+}
+
// TestDiscoverFeeds_IgnoresUnknownTypes verifies that alternate links with
// unrecognized MIME types (e.g. JSON Feed) are skipped while supported
// types are still discovered.
func TestDiscoverFeeds_IgnoresUnknownTypes(t *testing.T) {
	html := `<html><head>
  <link rel="alternate" type="application/json" href="/feed.json">
  <link rel="alternate" type="application/rss+xml" href="/feed.rss">
</head><body></body></html>`

	links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
	if len(links) != 1 {
		t.Fatalf("expected 1 link, got %d", len(links))
	}
	if links[0].Type != "rss" {
		t.Errorf("expected rss, got %s", links[0].Type)
	}
}
+
// TestDiscoverFeeds_StopsAtBody pins the intentional behavior that parsing
// stops at the opening <body> tag: feed links placed in the body are ignored.
func TestDiscoverFeeds_StopsAtBody(t *testing.T) {
	html := `<html><head></head><body>
  <link rel="alternate" type="application/atom+xml" href="/feed.atom">
</body></html>`

	links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
	if len(links) != 0 {
		t.Fatalf("expected 0 links (should stop at body), got %d", len(links))
	}
}
+
+func TestSelectFeed_PrefersAtom(t *testing.T) {
+ links := []feedLink{
+ {URL: "https://example.com/rss", Type: "rss"},
+ {URL: "https://example.com/atom", Type: "atom"},
+ }
+ got := selectFeed(links)
+ if got != "https://example.com/atom" {
+ t.Errorf("expected Atom URL, got %s", got)
+ }
+}
+
+func TestSelectFeed_FallsBackToRSS(t *testing.T) {
+ links := []feedLink{
+ {URL: "https://example.com/rss", Type: "rss"},
+ }
+ got := selectFeed(links)
+ if got != "https://example.com/rss" {
+ t.Errorf("expected RSS URL, got %s", got)
+ }
+}
+
+func TestSelectFeed_EmptyList(t *testing.T) {
+ got := selectFeed(nil)
+ if got != "" {
+ t.Errorf("expected empty string, got %s", got)
+ }
+}
diff --git a/backend/feed/feed.go b/backend/feed/feed.go
index 4349d1e..2d84798 100644
--- a/backend/feed/feed.go
+++ b/backend/feed/feed.go
@@ -2,7 +2,10 @@ package feed
import (
"context"
+ "errors"
"fmt"
+ "net/http"
+ "net/url"
"time"
"github.com/mmcdole/gofeed"
@@ -10,15 +13,67 @@ import (
"undef.ninja/x/feedaka/db"
)
-func Fetch(ctx context.Context, url string) (*gofeed.Feed, error) {
// FetchResult holds the result of fetching a feed, including the resolved URL.
type FetchResult struct {
	Feed *gofeed.Feed // the parsed feed document
	// URL is the address the feed was actually fetched from; it differs
	// from the requested URL when HTML auto-discovery was used.
	URL string
}
+
+func Fetch(ctx context.Context, rawURL string) (*FetchResult, error) {
fp := gofeed.NewParser()
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
- feed, err := fp.ParseURLWithContext(url, ctx)
+
+ f, err := fp.ParseURLWithContext(rawURL, ctx)
+ if err == nil {
+ return &FetchResult{Feed: f, URL: rawURL}, nil
+ }
+
+ if !errors.Is(err, gofeed.ErrFeedTypeNotDetected) {
+ return nil, fmt.Errorf("failed to fetch %s: %w", rawURL, err)
+ }
+
+ discoveredURL, discoverErr := discoverFeedURL(ctx, rawURL)
+ if discoverErr != nil {
+ return nil, fmt.Errorf("failed to fetch %s: not a feed and auto-discovery failed: %w", rawURL, discoverErr)
+ }
+
+ f, err = fp.ParseURLWithContext(discoveredURL, ctx)
if err != nil {
- return nil, fmt.Errorf("failed to fetch %s: %w", url, err)
+ return nil, fmt.Errorf("failed to fetch discovered feed %s: %w", discoveredURL, err)
}
- return feed, nil
+
+ return &FetchResult{Feed: f, URL: discoveredURL}, nil
+}
+
+func discoverFeedURL(ctx context.Context, rawURL string) (string, error) {
+ req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+ if err != nil {
+ return "", err
+ }
+
+ resp, err := http.DefaultClient.Do(req)
+ if err != nil {
+ return "", err
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ return "", fmt.Errorf("HTTP %d", resp.StatusCode)
+ }
+
+ base, err := url.Parse(rawURL)
+ if err != nil {
+ return "", err
+ }
+
+ links := discoverFeeds(resp.Body, base)
+ feedURL := selectFeed(links)
+ if feedURL == "" {
+ return "", fmt.Errorf("no feed links found in HTML")
+ }
+
+ return feedURL, nil
}
func Sync(ctx context.Context, queries *db.Queries, feedID int64, f *gofeed.Feed) error {
diff --git a/backend/go.mod b/backend/go.mod
index d8a84eb..cb62854 100644
--- a/backend/go.mod
+++ b/backend/go.mod
@@ -13,6 +13,7 @@ require (
github.com/mmcdole/gofeed v1.3.0
github.com/oapi-codegen/runtime v1.1.2
golang.org/x/crypto v0.39.0
+ golang.org/x/net v0.41.0
)
require (
@@ -82,7 +83,6 @@ require (
go.uber.org/zap v1.27.0 // indirect
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
golang.org/x/mod v0.25.0 // indirect
- golang.org/x/net v0.41.0 // indirect
golang.org/x/sync v0.15.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/text v0.26.0 // indirect