From 813b1f553410dee01466064d03ffccfaa617e061 Mon Sep 17 00:00:00 2001
From: AHMET YILMAZ
Date: Mon, 8 Sep 2025 19:09:33 +0800
Subject: [PATCH] #1268 fix: update redirected_url to current page URL and
 enhance normalize_url function

---
 crawl4ai/async_crawler_strategy.py |  2 +-
 crawl4ai/async_webcrawler.py       |  2 +-
 crawl4ai/utils.py                  | 59 ++++++++++++++++++++++++++----
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 943867d0b..6c3a5f556 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1037,7 +1037,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
             downloaded_files=(
                 self._downloaded_files if self._downloaded_files else None
             ),
-            redirected_url=redirected_url,
+            redirected_url=page.url,  # Update to current URL in case of JavaScript navigation
             # Include captured data if enabled
             network_requests=captured_requests if config.capture_network_requests else None,
             console_messages=captured_console if config.capture_console_messages else None,
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index f12fc488e..dc270c9c7 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -480,7 +480,7 @@ async def aprocess_html(
         # Scraping Strategy Execution #
         ################################
         result: ScrapingResult = scraping_strategy.scrap(
-            url, html, **params)
+            kwargs.get("redirected_url", url), html, **params)
 
         if result is None:
             raise ValueError(
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 046351e7d..37221e916 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2149,8 +2149,10 @@ def normalize_url(
     *,
     drop_query_tracking=True,
     sort_query=True,
-    keep_fragment=False,
+    keep_fragment=True,
+    remove_fragments=None,  # alias for keep_fragment=False
     extra_drop_params=None,
+    params_to_remove=None,  # alias for extra_drop_params
     preserve_https=False,
     original_scheme=None
 ):
@@ -2175,10 +2177,20 @@ def normalize_url(
     Returns
     -------
     str | None
-        A clean, canonical URL or None if href is empty/None.
+        A clean, canonical URL or the base URL if href is empty/None.
""" if not href: - return None + # For empty href, return the base URL (matching urljoin behavior) + return base_url + + # Validate base URL format + parsed_base = urlparse(base_url) + if not parsed_base.scheme or not parsed_base.netloc: + raise ValueError(f"Invalid base URL format: {base_url}") + + if parsed_base.scheme.lower() not in ["http", "https"]: + # Handle special protocols + raise ValueError(f"Invalid base URL format: {base_url}") # Resolve relative paths first full_url = urljoin(base_url, href.strip()) @@ -2199,6 +2211,12 @@ def normalize_url( # ── netloc ── netloc = parsed.netloc.lower() + + # Remove default ports (80 for http, 443 for https) + if ':' in netloc: + host, port = netloc.rsplit(':', 1) + if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'): + netloc = host # ── path ── # Strip duplicate slashes and trailing "/" (except root) @@ -2206,7 +2224,17 @@ def normalize_url( # The path from urlparse is already properly encoded path = parsed.path if path.endswith('/') and path != '/': - path = path.rstrip('/') + # Only strip trailing slash if the original href didn't have a trailing slash + # and the base_url didn't end with a slash + base_parsed = urlparse(base_url) + if not href.strip().endswith('/') and not base_parsed.path.endswith('/'): + path = path.rstrip('/') + # Add trailing slash for URLs without explicit paths (indicates directory) + # But skip this for special protocols that don't use standard URL structure + elif not path: + special_protocols = {"javascript:", "mailto:", "tel:", "file:", "data:"} + if not any(href.strip().lower().startswith(p) for p in special_protocols): + path = '/' # ── query ── query = parsed.query @@ -2221,6 +2249,8 @@ def normalize_url( } if extra_drop_params: default_tracking |= {p.lower() for p in extra_drop_params} + if params_to_remove: + default_tracking |= {p.lower() for p in params_to_remove} params = [(k, v) for k, v in params if k not in default_tracking] if sort_query: @@ -2229,7 +2259,10 @@ def normalize_url( query = urlencode(params, doseq=True) if params else '' # ── fragment ── - fragment = parsed.fragment if keep_fragment else '' + if remove_fragments is True: + fragment = '' + else: + fragment = parsed.fragment if keep_fragment else '' # Re-assemble normalized = urlunparse(( @@ -2453,9 +2486,19 @@ def is_external_url(url: str, base_domain: str) -> bool: if not parsed.netloc: # Relative URL return False - # Strip 'www.' from both domains for comparison - url_domain = parsed.netloc.lower().replace("www.", "") - base = base_domain.lower().replace("www.", "") + # Don't strip 'www.' from domains for comparison - treat www.example.com and example.com as different + url_domain = parsed.netloc.lower() + base = base_domain.lower() + + # Strip user credentials from URL domain + if '@' in url_domain: + url_domain = url_domain.split('@', 1)[1] + + # Strip ports from both for comparison (any port should be considered same domain) + if ':' in url_domain: + url_domain = url_domain.rsplit(':', 1)[0] + if ':' in base: + base = base.rsplit(':', 1)[0] # Check if URL domain ends with base domain return not url_domain.endswith(base)