2 changes: 1 addition & 1 deletion crawl4ai/async_crawler_strategy.py

@@ -1037,7 +1037,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
             downloaded_files=(
                 self._downloaded_files if self._downloaded_files else None
             ),
-            redirected_url=redirected_url,
+            redirected_url=page.url,  # Update to current URL in case of JavaScript navigation
             # Include captured data if enabled
             network_requests=captured_requests if config.capture_network_requests else None,
             console_messages=captured_console if config.capture_console_messages else None,
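Reviewer note: the switch to page.url matters because Playwright keeps page.url in sync with client-side navigation (pushState, location changes) that never produces an HTTP redirect. A minimal standalone sketch, assuming Playwright is installed; the target URL and route are placeholders, not crawl4ai code:

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com/")  # initial navigation
        # JavaScript navigation: no HTTP redirect is issued, but the page URL changes
        await page.evaluate("history.pushState({}, '', '/spa/route')")
        print(page.url)  # expected to reflect the /spa/route path, not the original URL
        await browser.close()

asyncio.run(main())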
2 changes: 1 addition & 1 deletion crawl4ai/async_webcrawler.py

@@ -480,7 +480,7 @@ async def aprocess_html(
         # Scraping Strategy Execution #
         ################################
         result: ScrapingResult = scraping_strategy.scrap(
-            url, html, **params)
+            kwargs.get("redirected_url", url), html, **params)

         if result is None:
             raise ValueError(
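The call site now prefers the URL the browser actually ended up on, falling back to the requested URL when no redirect was recorded. A trivial sketch of that fallback pattern (placeholder names, not the real aprocess_html signature):

def pick_base_url(url: str, **kwargs) -> str:
    # Prefer the final (post-redirect) URL; fall back to the requested URL.
    return kwargs.get("redirected_url", url)

print(pick_base_url("https://example.com/old"))  # https://example.com/old
print(pick_base_url("https://example.com/old", redirected_url="https://example.com/new"))  # https://example.com/new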
59 changes: 51 additions & 8 deletions crawl4ai/utils.py

@@ -2149,8 +2149,10 @@ def normalize_url(
     *,
     drop_query_tracking=True,
     sort_query=True,
-    keep_fragment=False,
+    keep_fragment=True,
+    remove_fragments=None,   # alias for keep_fragment=False
     extra_drop_params=None,
+    params_to_remove=None,   # alias for extra_drop_params
     preserve_https=False,
     original_scheme=None
 ):
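Usage sketch for the new keyword aliases and the flipped keep_fragment default, assuming normalize_url is importable from crawl4ai.utils and that its leading parameters are href and base_url as the function body suggests; printed values are illustrative, since the exact output depends on the rest of the function:

from crawl4ai.utils import normalize_url

base = "https://example.com/docs/"

# Fragments are now kept by default (keep_fragment=True)
print(normalize_url(href="page#intro", base_url=base))

# Alias-style spellings are accepted alongside the original parameters
print(normalize_url(href="page#intro", base_url=base, remove_fragments=True))     # alias for keep_fragment=False
print(normalize_url(href="page?ref=x", base_url=base, params_to_remove=["ref"]))  # alias for extra_drop_params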
@@ -2175,10 +2177,20 @@
     Returns
     -------
     str | None
-        A clean, canonical URL or None if href is empty/None.
+        A clean, canonical URL or the base URL if href is empty/None.
     """
     if not href:
-        return None
+        # For empty href, return the base URL (matching urljoin behavior)
+        return base_url
+
+    # Validate base URL format
+    parsed_base = urlparse(base_url)
+    if not parsed_base.scheme or not parsed_base.netloc:
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    if parsed_base.scheme.lower() not in ["http", "https"]:
+        # Handle special protocols
+        raise ValueError(f"Invalid base URL format: {base_url}")

     # Resolve relative paths first
     full_url = urljoin(base_url, href.strip())
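A quick sketch of the two behavior changes in this hunk: an empty href now resolves to the base URL, and a malformed or non-http(s) base raises ValueError (same import assumptions as above):

from crawl4ai.utils import normalize_url

# Empty href now falls back to the base URL instead of returning None,
# mirroring urljoin("https://example.com/page", "").
print(normalize_url(href="", base_url="https://example.com/page"))

# A base URL without a scheme/netloc, or with a non-http(s) scheme, now raises.
try:
    normalize_url(href="about", base_url="not-a-url")
except ValueError as exc:
    print(exc)  # Invalid base URL format: not-a-url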
@@ -2199,14 +2211,30 @@
     # ── netloc ──
     netloc = parsed.netloc.lower()
+
+    # Remove default ports (80 for http, 443 for https)
+    if ':' in netloc:
+        host, port = netloc.rsplit(':', 1)
+        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
+            netloc = host

     # ── path ──
     # Strip duplicate slashes and trailing "/" (except root)
     # IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
     # The path from urlparse is already properly encoded
     path = parsed.path
     if path.endswith('/') and path != '/':
-        path = path.rstrip('/')
+        # Only strip trailing slash if the original href didn't have a trailing slash
+        # and the base_url didn't end with a slash
+        base_parsed = urlparse(base_url)
+        if not href.strip().endswith('/') and not base_parsed.path.endswith('/'):
+            path = path.rstrip('/')
+    # Add trailing slash for URLs without explicit paths (indicates directory)
+    # But skip this for special protocols that don't use standard URL structure
+    elif not path:
+        special_protocols = {"javascript:", "mailto:", "tel:", "file:", "data:"}
+        if not any(href.strip().lower().startswith(p) for p in special_protocols):
+            path = '/'

     # ── query ──
     query = parsed.query
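The default-port rule can be exercised on its own with the standard library; this is a standalone replication of just that part of the hunk, not crawl4ai code:

from urllib.parse import urlparse

def strip_default_port(url: str) -> str:
    # Drop :80 on http and :443 on https, keep any other port.
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()
    if ':' in netloc:
        host, port = netloc.rsplit(':', 1)
        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
            netloc = host
    return parsed._replace(netloc=netloc).geturl()

print(strip_default_port("https://Example.com:443/a"))   # https://example.com/a
print(strip_default_port("https://example.com:8443/a"))  # non-default port is kept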
@@ -2221,6 +2249,8 @@
     }
     if extra_drop_params:
         default_tracking |= {p.lower() for p in extra_drop_params}
+    if params_to_remove:
+        default_tracking |= {p.lower() for p in params_to_remove}
     params = [(k, v) for k, v in params if k not in default_tracking]

     if sort_query:
@@ -2229,7 +2259,10 @@
         query = urlencode(params, doseq=True) if params else ''

     # ── fragment ──
-    fragment = parsed.fragment if keep_fragment else ''
+    if remove_fragments is True:
+        fragment = ''
+    else:
+        fragment = parsed.fragment if keep_fragment else ''

     # Re-assemble
     normalized = urlunparse((
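Standalone sketch of the fragment precedence this hunk introduces: remove_fragments=True overrides everything else, otherwise keep_fragment (now defaulting to True) decides. Not crawl4ai code, just the decision logic in isolation:

def pick_fragment(fragment: str, keep_fragment: bool = True, remove_fragments=None) -> str:
    # remove_fragments=True wins; otherwise fall back to the keep_fragment flag.
    if remove_fragments is True:
        return ''
    return fragment if keep_fragment else ''

print(pick_fragment("section-2"))                         # 'section-2' (kept by default)
print(pick_fragment("section-2", remove_fragments=True))  # ''
print(pick_fragment("section-2", keep_fragment=False))    # ''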
@@ -2453,9 +2486,19 @@ def is_external_url(url: str, base_domain: str) -> bool:
     if not parsed.netloc:  # Relative URL
         return False

-    # Strip 'www.' from both domains for comparison
-    url_domain = parsed.netloc.lower().replace("www.", "")
-    base = base_domain.lower().replace("www.", "")
+    # Don't strip 'www.' from domains for comparison - treat www.example.com and example.com as different
+    url_domain = parsed.netloc.lower()
+    base = base_domain.lower()
+
+    # Strip user credentials from URL domain
+    if '@' in url_domain:
+        url_domain = url_domain.split('@', 1)[1]
+
+    # Strip ports from both for comparison (any port should be considered same domain)
+    if ':' in url_domain:
+        url_domain = url_domain.rsplit(':', 1)[0]
+    if ':' in base:
+        base = base.rsplit(':', 1)[0]

     # Check if URL domain ends with base domain
     return not url_domain.endswith(base)
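Usage sketch of the revised comparison rules, assuming is_external_url is importable from crawl4ai.utils; the commented results follow the logic shown in the hunk:

from crawl4ai.utils import is_external_url

# www. is no longer stripped, so a bare domain no longer matches a www. base
print(is_external_url("https://example.com/page", "www.example.com"))  # True (external)

# Credentials and ports are ignored when comparing hosts
print(is_external_url("https://user:pw@example.com/", "example.com"))  # False (internal)
print(is_external_url("https://example.com:8080/", "example.com"))     # False (internal)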