feat(feed): auto-discover feed URLs from HTML pages

When an HTML page is provided instead of a direct feed URL, parse <link rel="alternate"> tags to find RSS/Atom feeds. Atom is preferred over RSS when both are present.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
author: nsfisis <nsfisis@gmail.com> 2026-02-14 12:20:31 +0900
committer: nsfisis <nsfisis@gmail.com> 2026-02-14 12:20:31 +0900
commit: 042fcb5c4eac16f18fc051f55a6c63ca9e97306b (patch)
tree: 1a61d1f7690e933a8d1e452e744ac02db14af042
parent: fffd36268a216044523c3f5227c3d375608c36dc (diff)
download: feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.gz
feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.zst
feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.zip
6 files changed, 289 insertions, 11 deletions
diff --git a/backend/api/handler_feeds.go b/backend/api/handler_feeds.go
index f0a8785..4d16e4b 100644
--- a/backend/api/handler_feeds.go
+++ b/backend/api/handler_feeds.go
@@ -46,14 +46,14 @@ func (h *Handler) FeedsAddFeed(ctx context.Context, request FeedsAddFeedRequestO
 		return nil, fmt.Errorf("authentication required")
 	}
 
-	f, err := feed.Fetch(ctx, request.Body.Url)
+	result, err := feed.Fetch(ctx, request.Body.Url)
 	if err != nil {
 		return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to parse feed: %v", err)}, nil
 	}
 
 	dbFeed, err := h.Queries.CreateFeed(ctx, db.CreateFeedParams{
-		Url:       request.Body.Url,
-		Title:     f.Title,
+		Url:       result.URL,
+		Title:     result.Feed.Title,
 		FetchedAt: time.Now().UTC().Format(time.RFC3339),
 		UserID:    userID,
 	})
@@ -61,7 +61,7 @@ func (h *Handler) FeedsAddFeed(ctx context.Context, request FeedsAddFeedRequestO
 		return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to insert feed: %v", err)}, nil
 	}
 
-	if err := feed.Sync(ctx, h.Queries, dbFeed.ID, f); err != nil {
+	if err := feed.Sync(ctx, h.Queries, dbFeed.ID, result.Feed); err != nil {
 		return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to sync articles: %v", err)}, nil
 	}
 
diff --git a/backend/cmd/serve.go b/backend/cmd/serve.go
index c65a19f..4b32868 100644
--- a/backend/cmd/serve.go
+++ b/backend/cmd/serve.go
@@ -25,11 +25,11 @@ import (
 
 func fetchOneFeed(feedID int64, url string, ctx context.Context, queries *db.Queries) error {
 	log.Printf("Fetching %s...\n", url)
-	f, err := feed.Fetch(ctx, url)
+	result, err := feed.Fetch(ctx, url)
 	if err != nil {
 		return err
 	}
-	return feed.Sync(ctx, queries, feedID, f)
+	return feed.Sync(ctx, queries, feedID, result.Feed) // only the parsed feed is synced; result.URL is not used here
 }
 
 func listFeedsToBeFetched(ctx context.Context, queries *db.Queries) (map[int64]string, error) {
diff --git a/backend/feed/discover.go b/backend/feed/discover.go
new file mode 100644
index 0000000..962cbcd
--- /dev/null
+++ b/backend/feed/discover.go
@@ -0,0 +1,94 @@
+package feed
+
+import (
+	"io"
+	"net/url"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+type feedLink struct { // feedLink is one feed reference collected by discoverFeeds.
+	URL  string // feed URL after resolution against the page's base URL
+	Type string // "atom" or "rss"
+}
+
+// discoverFeeds tokenizes HTML from body and returns, in document order, the Atom/RSS feeds advertised by
+// <link rel="alternate"> tags before the first <body> tag. Each href is resolved against baseURL.
+func discoverFeeds(body io.Reader, baseURL *url.URL) []feedLink {
+	var links []feedLink
+	z := html.NewTokenizer(body)
+	for {
+		tt := z.Next()
+		switch tt {
+		case html.ErrorToken: // end of input or a tokenizer error: return whatever was collected so far
+			return links
+		case html.StartTagToken, html.SelfClosingTagToken:
+			tn, _ := z.TagName() // the tokenizer reports tag names lower-cased, so plain == comparisons suffice
+			tagName := string(tn)
+
+			if tagName == "body" { // feed links are only looked for ahead of <body>; stop scanning here
+				return links
+			}
+			if tagName != "link" {
+				continue
+			}
+
+			attrs := tokenAttrs(z)
+			rel := strings.ToLower(attrs["rel"]) // rel and type are compared case-insensitively
+			typ := strings.ToLower(attrs["type"])
+			href := attrs["href"]
+
+			if rel != "alternate" || href == "" { // NOTE(review): exact match, so a multi-token rel such as "alternate feed" is skipped
+				continue
+			}
+
+			var feedType string
+			switch typ {
+			case "application/atom+xml":
+				feedType = "atom"
+			case "application/rss+xml":
+				feedType = "rss"
+			default: // any other (or missing) type is not treated as a feed
+				continue
+			}
+
+			ref, err := url.Parse(href)
+			if err != nil { // an unparsable href skips this link instead of failing the whole scan
+				continue
+			}
+			resolved := baseURL.ResolveReference(ref).String()
+			links = append(links, feedLink{URL: resolved, Type: feedType})
+		}
+	}
+}
+
+// selectFeed returns the URL of the first Atom link in links, or of the first
+// RSS link when there is no Atom link, or "" when links contains neither.
+func selectFeed(links []feedLink) string {
+	for _, l := range links {
+		if l.Type == "atom" {
+			return l.URL
+		}
+	}
+	for _, l := range links {
+		if l.Type == "rss" {
+			return l.URL
+		}
+	}
+	return ""
+}
+
+func tokenAttrs(z *html.Tokenizer) map[string]string { // tokenAttrs drains the current tag's attributes from z into a key/value map.
+	attrs := make(map[string]string)
+	for {
+		key, val, more := z.TagAttr()
+		if len(key) > 0 { // entries with an empty key are dropped; a repeated key keeps the value seen last
+			attrs[string(key)] = string(val)
+		}
+		if !more {
+			break
+		}
+	}
+	return attrs
+}
diff --git a/backend/feed/discover_test.go b/backend/feed/discover_test.go
new file mode 100644
index 0000000..4a1315e
--- /dev/null
+++ b/backend/feed/discover_test.go
@@ -0,0 +1,129 @@
+package feed
+
+import (
+	"net/url"
+	"strings"
+	"testing"
+)
+
+func mustParseURL(s string) *url.URL { // mustParseURL is a test helper that parses s and panics if parsing fails.
+	u, err := url.Parse(s)
+	if err != nil {
+		panic(err)
+	}
+	return u
+}
+
+func TestDiscoverFeeds_AtomAndRSS(t *testing.T) { // Both feed types are returned, in document order, with their hrefs resolved.
+	html := `<!DOCTYPE html>
+<html>
+<head>
+    <link rel="alternate" type="application/atom+xml" href="/feed.atom" title="Atom Feed">
+    <link rel="alternate" type="application/rss+xml" href="/feed.rss" title="RSS Feed">
+</head>
+<body></body>
+</html>`
+
+	base := mustParseURL("https://example.com/blog") // the hrefs are root-relative, so "/blog" does not appear in the expected URLs
+	links := discoverFeeds(strings.NewReader(html), base)
+
+	if len(links) != 2 {
+		t.Fatalf("expected 2 links, got %d", len(links))
+	}
+	if links[0].URL != "https://example.com/feed.atom" || links[0].Type != "atom" {
+		t.Errorf("unexpected first link: %+v", links[0])
+	}
+	if links[1].URL != "https://example.com/feed.rss" || links[1].Type != "rss" {
+		t.Errorf("unexpected second link: %+v", links[1])
+	}
+}
+
+func TestDiscoverFeeds_AbsoluteURL(t *testing.T) { // An absolute href is kept unchanged, even when its host differs from the page's.
+	html := `<html><head>
+    <link rel="alternate" type="application/atom+xml" href="https://other.com/feed.xml">
+</head><body></body></html>`
+
+	base := mustParseURL("https://example.com")
+	links := discoverFeeds(strings.NewReader(html), base)
+
+	if len(links) != 1 {
+		t.Fatalf("expected 1 link, got %d", len(links))
+	}
+	if links[0].URL != "https://other.com/feed.xml" {
+		t.Errorf("expected absolute URL preserved, got %s", links[0].URL)
+	}
+}
+
+func TestDiscoverFeeds_NoFeeds(t *testing.T) { // A page without any <link> tag yields no feed links.
+	html := `<html><head><title>No feeds</title></head><body></body></html>`
+	links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
+	if len(links) != 0 {
+		t.Fatalf("expected 0 links, got %d", len(links))
+	}
+}
+
+func TestDiscoverFeeds_IgnoresNonAlternate(t *testing.T) { // The stylesheet <link> is skipped; only the rel="alternate" link is counted.
+	html := `<html><head>
+    <link rel="stylesheet" type="text/css" href="/style.css">
+    <link rel="alternate" type="application/atom+xml" href="/feed.atom">
+</head><body></body></html>`
+
+	links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
+	if len(links) != 1 {
+		t.Fatalf("expected 1 link, got %d", len(links))
+	}
+}
+
+func TestDiscoverFeeds_IgnoresUnknownTypes(t *testing.T) { // type="application/json" is not accepted as a feed type; only the RSS link is returned.
+	html := `<html><head>
+    <link rel="alternate" type="application/json" href="/feed.json">
+    <link rel="alternate" type="application/rss+xml" href="/feed.rss">
+</head><body></body></html>`
+
+	links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
+	if len(links) != 1 {
+		t.Fatalf("expected 1 link, got %d", len(links))
+	}
+	if links[0].Type != "rss" {
+		t.Errorf("expected rss, got %s", links[0].Type)
+	}
+}
+
+func TestDiscoverFeeds_StopsAtBody(t *testing.T) { // A feed <link> placed inside <body> is not discovered.
+	html := `<html><head></head><body>
+    <link rel="alternate" type="application/atom+xml" href="/feed.atom">
+</body></html>`
+
+	links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com"))
+	if len(links) != 0 {
+		t.Fatalf("expected 0 links (should stop at body), got %d", len(links))
+	}
+}
+
+func TestSelectFeed_PrefersAtom(t *testing.T) { // Atom is chosen even though the RSS link is listed first.
+	links := []feedLink{
+		{URL: "https://example.com/rss", Type: "rss"},
+		{URL: "https://example.com/atom", Type: "atom"},
+	}
+	got := selectFeed(links)
+	if got != "https://example.com/atom" {
+		t.Errorf("expected Atom URL, got %s", got)
+	}
+}
+
+func TestSelectFeed_FallsBackToRSS(t *testing.T) { // With no Atom link in the list, the RSS link is chosen.
+	links := []feedLink{
+		{URL: "https://example.com/rss", Type: "rss"},
+	}
+	got := selectFeed(links)
+	if got != "https://example.com/rss" {
+		t.Errorf("expected RSS URL, got %s", got)
+	}
+}
+
+func TestSelectFeed_EmptyList(t *testing.T) { // A nil slice yields the empty string.
+	got := selectFeed(nil)
+	if got != "" {
+		t.Errorf("expected empty string, got %s", got)
+	}
+}
diff --git a/backend/feed/feed.go b/backend/feed/feed.go
index 4349d1e..2d84798 100644
--- a/backend/feed/feed.go
+++ b/backend/feed/feed.go
@@ -2,7 +2,10 @@ package feed
 
 import (
 	"context"
+	"errors"
 	"fmt"
+	"net/http"
+	"net/url"
 	"time"
 
 	"github.com/mmcdole/gofeed"
@@ -10,15 +13,67 @@ import (
 	"undef.ninja/x/feedaka/db"
 )
 
-func Fetch(ctx context.Context, url string) (*gofeed.Feed, error) {
+// FetchResult is what Fetch returns: the parsed feed together with the URL it was actually parsed from.
+type FetchResult struct {
+	Feed *gofeed.Feed // the parsed feed
+	URL  string       // the URL passed to Fetch, or the auto-discovered feed URL when that URL was not itself a feed
+}
+
+func Fetch(ctx context.Context, rawURL string) (*FetchResult, error) { // Fetch parses the feed at rawURL; if rawURL is not a feed, it falls back to feed auto-discovery on that page.
 	fp := gofeed.NewParser()
 	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
 	defer cancel()
-	feed, err := fp.ParseURLWithContext(url, ctx)
+
+	f, err := fp.ParseURLWithContext(rawURL, ctx)
+	if err == nil {
+		return &FetchResult{Feed: f, URL: rawURL}, nil
+	}
+
+	if !errors.Is(err, gofeed.ErrFeedTypeNotDetected) { // only "feed type not detected" triggers auto-discovery; other errors are returned wrapped
+		return nil, fmt.Errorf("failed to fetch %s: %w", rawURL, err)
+	}
+
+	discoveredURL, discoverErr := discoverFeedURL(ctx, rawURL)
+	if discoverErr != nil {
+		return nil, fmt.Errorf("failed to fetch %s: not a feed and auto-discovery failed: %w", rawURL, discoverErr)
+	}
+
+	f, err = fp.ParseURLWithContext(discoveredURL, ctx) // reuses the same 10-second ctx as the first attempt and the discovery request
 	if err != nil {
-		return nil, fmt.Errorf("failed to fetch %s: %w", url, err)
+		return nil, fmt.Errorf("failed to fetch discovered feed %s: %w", discoveredURL, err)
 	}
-	return feed, nil
+
+	return &FetchResult{Feed: f, URL: discoveredURL}, nil // URL reports the discovered feed address, not rawURL
+}
+
+func discoverFeedURL(ctx context.Context, rawURL string) (string, error) { // discoverFeedURL GETs rawURL, scans the response as HTML and returns the preferred advertised feed URL.
+	req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+	if err != nil {
+		return "", err
+	}
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 { // only 2xx responses are scanned for feed links
+		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
+	}
+
+	base, err := url.Parse(rawURL) // NOTE(review): base is the requested URL; if the client followed a redirect, resp.Request.URL would be the page's real address
+	if err != nil {
+		return "", err
+	}
+
+	links := discoverFeeds(resp.Body, base)
+	feedURL := selectFeed(links)
+	if feedURL == "" { // selectFeed returns "" when no Atom or RSS link was found
+		return "", fmt.Errorf("no feed links found in HTML")
+	}
+
+	return feedURL, nil
 }
 
 func Sync(ctx context.Context, queries *db.Queries, feedID int64, f *gofeed.Feed) error {
diff --git a/backend/go.mod b/backend/go.mod
index d8a84eb..cb62854 100644
--- a/backend/go.mod
+++ b/backend/go.mod
@@ -13,6 +13,7 @@ require (
 	github.com/mmcdole/gofeed v1.3.0
 	github.com/oapi-codegen/runtime v1.1.2
 	golang.org/x/crypto v0.39.0
+	golang.org/x/net v0.41.0
 )
 
 require (
@@ -82,7 +83,6 @@ require (
 	go.uber.org/zap v1.27.0 // indirect
 	golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
 	golang.org/x/mod v0.25.0 // indirect
-	golang.org/x/net v0.41.0 // indirect
 	golang.org/x/sync v0.15.0 // indirect
 	golang.org/x/sys v0.33.0 // indirect
 	golang.org/x/text v0.26.0 // indirect
author	nsfisis <nsfisis@gmail.com>	2026-02-14 12:20:31 +0900
committer	nsfisis <nsfisis@gmail.com>	2026-02-14 12:20:31 +0900
commit	042fcb5c4eac16f18fc051f55a6c63ca9e97306b (patch)
tree	1a61d1f7690e933a8d1e452e744ac02db14af042
parent	fffd36268a216044523c3f5227c3d375608c36dc (diff)
download	feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.gz feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.zst feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.zip