"""Web fetch tool implementation."""


from __future__ import annotations

import re
import urllib.error
import urllib.request
from typing import Any

from harzoo.agent.kernel.tool import Tool, ToolResult

# Version stamp of this tool implementation (date-formatted string).
# NOTE: this must stay below `from __future__ import annotations`; a future
# statement may only be preceded by the module docstring, comments, blank
# lines and other future statements, otherwise the module fails to compile.
TOOL_VERSION = "2026-05-22"


def _extract_text(html: str) -> str:
    html = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r"<style[^>]*>.*?</style>", "", html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r"<!--.*?-->", "", html, flags=re.DOTALL)
    html = re.sub(r"<(nav|footer|aside)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r"<br[^>]*>", "\n", html, flags=re.IGNORECASE)
    html = re.sub(r"<p[^>]*>", "\n", html, flags=re.IGNORECASE)
    html = re.sub(r"<li[^>]*>", "\n- ", html, flags=re.IGNORECASE)
    html = re.sub(r"<h[1-6][^>]*>", "\n\n", html, flags=re.IGNORECASE)
    html = re.sub(r"<[^>]+>", "", html)
    html = re.sub(r"[ \t]+", " ", html)
    html = re.sub(r"\n\s*\n+", "\n\n", html)
    return html.strip()


class WebFetchTool(Tool):
    """Tool that downloads a URL over HTTP(S) and returns its text content.

    HTML responses are reduced to plain text via ``_extract_text``; any other
    response body is returned decoded but otherwise untouched.
    """

    name = "WebFetch"
    description = "Fetch and extract text content from a URL. Returns readable text from web pages."
    parameters = {
        "properties": {
            "url": {"type": "string", "description": "URL to fetch (http:// or https://)"},
            "timeout": {"type": "integer", "description": "Timeout in seconds (default 30, max 120)"},
        },
        "required": ["url"],
    }

    def execute(self, url: str, timeout: int = 30, **kwargs: Any) -> ToolResult:
        """Fetch ``url`` and return its (text-extracted) content.

        Args:
            url: Absolute ``http://`` or ``https://`` URL. Surrounding
                whitespace is ignored.
            timeout: Socket timeout in seconds; clamped to the range 5..120.
                ``None`` selects the default of 30.
            **kwargs: Accepted and ignored.

        Returns:
            On success, a ``ToolResult`` carrying ``text`` (at most 50,000
            characters, ``"(no content)"`` when empty), the requested ``url``
            and the response ``content_type``. On failure, a ``ToolResult``
            with code ``INVALID_ARGUMENTS``, ``HTTP_ERROR``, ``NETWORK_ERROR``
            or ``TOOL_EXCEPTION``. Failures are reported through the result;
            this method itself does not raise for them.
        """
        from urllib.parse import urlsplit

        # Validate arguments up front so malformed input becomes a failure
        # result rather than an uncaught TypeError/AttributeError.
        if not isinstance(url, str):
            return ToolResult.failure("URL must start with http:// or https://", code="INVALID_ARGUMENTS")
        url = url.strip()
        if not url.lower().startswith(("http://", "https://")):
            return ToolResult.failure("URL must start with http:// or https://", code="INVALID_ARGUMENTS")

        if timeout is None:
            timeout = 30
        try:
            timeout = int(timeout)
        except (TypeError, ValueError, OverflowError):
            return ToolResult.failure("timeout must be an integer number of seconds", code="INVALID_ARGUMENTS")
        timeout = max(5, min(120, timeout))

        try:
            req = urllib.request.Request(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                    "Accept": "text/html,application/xhtml+xml,text/plain",
                },
            )
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                content_type = resp.headers.get("Content-Type", "")
                # Cap the download so a huge response cannot exhaust memory.
                raw = resp.read(500_000)

            # The charset parameter name is case-insensitive and its value may
            # be quoted (e.g. `Charset="ISO-8859-1"`); handle both forms.
            charset_match = re.search(
                r"charset\s*=\s*[\"']?([^\s;\"']+)", content_type, flags=re.IGNORECASE
            )
            charset = charset_match.group(1) if charset_match else "utf-8"
            try:
                body = raw.decode(charset)
            except (UnicodeDecodeError, LookupError):
                # Unknown or wrong charset: fall back to lossy UTF-8.
                body = raw.decode("utf-8", errors="replace")

            # Treat the body as HTML when the server says so, or when the URL
            # path (ignoring query string and fragment) has an HTML extension.
            media_type = content_type.lower()
            url_path = urlsplit(url).path.lower()
            is_html = (
                "text/html" in media_type
                or "application/xhtml+xml" in media_type
                or url_path.endswith((".html", ".htm"))
            )
            text = _extract_text(body) if is_html else body
            return ToolResult.success({"text": text[:50_000] or "(no content)", "url": url, "content_type": content_type})
        except urllib.error.HTTPError as e:
            # HTTPError subclasses URLError, so it must be handled first.
            return ToolResult.failure(f"HTTP {e.code} - {e.reason}", code="HTTP_ERROR")
        except urllib.error.URLError as e:
            return ToolResult.failure(f"{e.reason}", code="NETWORK_ERROR")
        except Exception as e:
            # Tool boundary: report anything unexpected instead of raising.
            return ToolResult.failure(f"{type(e).__name__}: {e}", code="TOOL_EXCEPTION")


# Module-level alias binding the tool class (not an instance) to a fixed name.
# NOTE(review): presumably this is the name a tool loader looks up — confirm.
TOOL = WebFetchTool