diff options
| author | nsfisis <nsfisis@gmail.com> | 2026-05-05 13:21:05 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2026-05-05 13:21:05 +0900 |
| commit | 7ee77a9e1002cc3c9e448361efe11a623bfd4fb9 (patch) | |
| tree | 643d95b50c30e2334e73e831e813592b98cbc421 /crates/mozart-vcs | |
| parent | 283ef7d1f4d85812dccd3b7e05441cfc05c59b4c (diff) | |
| download | php-mozart-7ee77a9e1002cc3c9e448361efe11a623bfd4fb9.tar.gz php-mozart-7ee77a9e1002cc3c9e448361efe11a623bfd4fb9.tar.zst php-mozart-7ee77a9e1002cc3c9e448361efe11a623bfd4fb9.zip | |
feat(vcs): use Composer-compatible URL-sanitize cache keys
Previously each VCS mirror was keyed by sha1(url), which made
cache directories opaque and incompatible with Composer's layout.
Composer's GitDriver and GitDownloader both use the form
Preg::replace('{[^a-z0-9.]}i', '-', Url::sanitize($url)), so a
Mozart user migrating from Composer (or vice versa) could not
share an existing cache.
Reimplement GitUtil::sanitize_url to follow that pattern: redact
credentials and access tokens (Url::sanitize semantics, including
the GitHub token regex), then replace every character outside [a-zA-Z0-9.]
with '-'. The credential redaction also collapses URLs that differ
only in their access_token to the same key.
Diffstat (limited to 'crates/mozart-vcs')
| -rw-r--r-- | crates/mozart-vcs/Cargo.toml | 1 | ||||
| -rw-r--r-- | crates/mozart-vcs/src/util/git.rs | 124 |
2 files changed, 118 insertions, 7 deletions
diff --git a/crates/mozart-vcs/Cargo.toml b/crates/mozart-vcs/Cargo.toml index 18eff25..92b3e24 100644 --- a/crates/mozart-vcs/Cargo.toml +++ b/crates/mozart-vcs/Cargo.toml @@ -13,7 +13,6 @@ regex.workspace = true reqwest.workspace = true serde.workspace = true serde_json.workspace = true -sha1.workspace = true tokio.workspace = true tracing.workspace = true url.workspace = true diff --git a/crates/mozart-vcs/src/util/git.rs b/crates/mozart-vcs/src/util/git.rs index dce13b3..ab4366d 100644 --- a/crates/mozart-vcs/src/util/git.rs +++ b/crates/mozart-vcs/src/util/git.rs @@ -1,10 +1,27 @@ use std::path::{Path, PathBuf}; +use std::sync::LazyLock; use anyhow::{Result, bail}; -use sha1::{Digest, Sha1}; +use regex::Regex; use crate::process::{ProcessExecutor, ProcessOutput}; +/// Modern GitHub token pattern (40+ hex chars, `ghp_…`, `github_pat_…`). +/// +/// Mirrors `Composer\Util\GitHub::GITHUB_TOKEN_REGEX`. +static GITHUB_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"^([a-fA-F0-9]{12,}|gh[a-zA-Z]_[a-zA-Z0-9_]+|github_pat_[a-zA-Z0-9_]+)$").unwrap() +}); + +/// `[?&]access_token=...` query parameter. +static ACCESS_TOKEN_RE: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"([&?]access_token=)[^&]+").unwrap()); + +/// `<scheme>://user:password@` credential block. +static CREDENTIALS_RE: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"(?i)(?P<prefix>[a-z0-9]+://)?(?P<user>[^:/\s@]+):(?P<password>[^@\s/]+)@").unwrap() +}); + /// Git utility for mirror management and protocol fallback. /// /// Corresponds to Composer's `Util\Git`. @@ -161,12 +178,25 @@ impl GitUtil { .map(|s| s.to_string()) } - /// Sanitize a URL for use as a directory name. + /// Sanitize a URL for use as a cache directory name. 
+ /// + /// Mirrors Composer's `Preg::replace('{[^a-z0-9.]}i', '-', Url::sanitize($url))` + /// pattern (see `GitDriver::initialize` and `GitDownloader`): credentials and + /// access tokens are first redacted, then every byte outside `[a-zA-Z0-9.]` + /// is replaced with `-`. The redaction step keeps cache keys stable across + /// URLs that differ only in their embedded token. pub fn sanitize_url(url: &str) -> String { - let mut hasher = Sha1::new(); - hasher.update(url.as_bytes()); - let hash = hasher.finalize(); - format!("{:x}", hash) + let redacted = sanitize_url_credentials(url); + redacted + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '.' { + c + } else { + '-' + } + }) + .collect() } /// Get the cache mirror path for a URL. @@ -200,3 +230,85 @@ impl GitUtil { urls } } + +/// Redact credentials and access tokens from `url`. +/// +/// Mirrors Composer's `Util\Url::sanitize`. Two replacements are applied: +/// 1. `[?&]access_token=…` query values → `***` +/// 2. `<scheme>://user:password@` credentials → `***:***@` if `user` looks like +/// a GitHub token, otherwise just `user:***@` +fn sanitize_url_credentials(url: &str) -> String { + let url = ACCESS_TOKEN_RE.replace_all(url, "${1}***"); + CREDENTIALS_RE + .replace_all(&url, |caps: ®ex::Captures<'_>| { + let prefix = caps.name("prefix").map(|m| m.as_str()).unwrap_or(""); + let user = &caps["user"]; + if GITHUB_TOKEN_RE.is_match(user) { + format!("{prefix}***:***@") + } else { + format!("{prefix}{user}:***@") + } + }) + .into_owned() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sanitize_url_replaces_special_chars_with_dash() { + assert_eq!( + GitUtil::sanitize_url("https://github.com/owner/repo.git"), + "https---github.com-owner-repo.git" + ); + } + + #[test] + fn sanitize_url_preserves_dot() { + // Dot must survive — it appears in hostnames and ".git" suffixes. 
+ let key = GitUtil::sanitize_url("git://example.org/foo.bar/baz.git"); + assert!(key.contains(".org")); + assert!(key.ends_with(".git")); + } + + #[test] + fn sanitize_url_redacts_password_in_credentials() { + let key = GitUtil::sanitize_url("https://alice:s3cret@example.com/repo.git"); + // Password is replaced with ***, then non-alphanumerics become '-'. + assert!(key.contains("alice")); + assert!(!key.contains("s3cret")); + } + + #[test] + fn sanitize_url_redacts_user_when_looks_like_github_token() { + // 40-hex token in the user position triggers full redaction. + let token = "abcdef0123456789abcdef0123456789abcdef01"; + let key = GitUtil::sanitize_url(&format!("https://{token}:x-oauth-basic@github.com/o/r")); + assert!(!key.contains("abcdef")); + } + + #[test] + fn sanitize_url_redacts_modern_github_pat() { + // ghp_xxx and github_pat_xxx forms. + let key1 = GitUtil::sanitize_url("https://ghp_abc123XYZ:x@github.com/o/r"); + assert!(!key1.contains("ghp_")); + let key2 = GitUtil::sanitize_url("https://github_pat_abc123:x@github.com/o/r"); + assert!(!key2.contains("github_pat_")); + } + + #[test] + fn sanitize_url_strips_access_token_query() { + let key = GitUtil::sanitize_url("https://api.github.com/x?access_token=secrettoken"); + assert!(!key.contains("secrettoken")); + } + + #[test] + fn sanitize_url_token_variants_share_cache_key() { + // Two pulls of the same repo with different access tokens should land + // in the same cache subdirectory. + let a = GitUtil::sanitize_url("https://api.github.com/repo?access_token=tokenA"); + let b = GitUtil::sanitize_url("https://api.github.com/repo?access_token=tokenB"); + assert_eq!(a, b); + } +} |
