using System.Net; using System.Net.Sockets; using System.Text.Json; using System.Text.Json.Nodes; using AIStudio.Provider; using AIStudio.Tools.PluginSystem; using HtmlAgilityPack; namespace AIStudio.Tools.ToolCallingSystem.ToolCallingImplementations; public sealed class ReadWebPageTool(HTMLParser htmlParser, ILogger logger) : IToolImplementation { private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(ReadWebPageTool).Namespace, nameof(ReadWebPageTool)); private const int DEFAULT_TIMEOUT_SECONDS = 30; private const int DEFAULT_MAX_CONTENT_CHARACTERS = 12000; private const int MAX_TRACE_LENGTH = 12000; private const string ALLOWED_PRIVATE_HOSTS_SETTING = "allowedPrivateHosts"; private static readonly string[] REMOVED_NODE_XPATHS = [ "//script", "//style", "//noscript", "//nav", "//footer", "//aside", "//form", "//iframe", "//*[@role='navigation']", "//*[@role='contentinfo']", "//*[@role='complementary']" ]; public string ImplementationKey => ToolSelectionRules.READ_WEB_PAGE_TOOL_ID; public string Icon => Icons.Material.Filled.Article; public IReadOnlySet SensitiveTraceArgumentNames => new HashSet(StringComparer.Ordinal); public string GetDisplayName() => TB("Read Web Page"); public string GetDescription() => TB("Load a single web page, extract its main HTML content, and return structured working material for the model. Use the result to synthesize a natural-language answer instead of exposing the raw payload to the user."); public string GetSettingsFieldLabel(string fieldName, ToolSettingsFieldDefinition fieldDefinition) => fieldName switch { "timeoutSeconds" => TB("Timeout Seconds"), "maxContentCharacters" => TB("Maximum Content Characters"), ALLOWED_PRIVATE_HOSTS_SETTING => TB("Allowed Private Hosts"), _ => TB(fieldDefinition.Title), }; public string GetSettingsFieldDescription(string fieldName, ToolSettingsFieldDefinition fieldDefinition) => fieldName switch { "timeoutSeconds" => TB("Optional HTTP timeout for loading a web page in seconds."), "maxContentCharacters" => TB("Optional global truncation limit for extracted Markdown returned to the model."), ALLOWED_PRIVATE_HOSTS_SETTING => TB("Optional host allowlist for private or VPN web pages. Separate host patterns with commas, such as example.de, *.example.de. Allowed private hosts require a High-confidence provider."), _ => TB(fieldDefinition.Description), }; public Task ValidateConfigurationAsync( ToolDefinition definition, IReadOnlyDictionary settingsValues, CancellationToken token = default) { if (!TryReadOptionalPositiveInt(settingsValues, "timeoutSeconds", out _, out var timeoutError)) { return Task.FromResult(new ToolConfigurationState { IsConfigured = false, Message = timeoutError, }); } if (!TryReadOptionalPositiveInt(settingsValues, "maxContentCharacters", out _, out var contentError)) { return Task.FromResult(new ToolConfigurationState { IsConfigured = false, Message = contentError, }); } if (!TryReadAllowedPrivateHostPatterns(settingsValues.GetValueOrDefault(ALLOWED_PRIVATE_HOSTS_SETTING), out _, out var allowlistError)) { return Task.FromResult(new ToolConfigurationState { IsConfigured = false, Message = allowlistError, }); } return Task.FromResult(null); } public async Task ExecuteAsync(JsonElement arguments, ToolExecutionContext context, CancellationToken token = default) { var urlText = ReadRequiredString(arguments, "url"); if (!Uri.TryCreate(urlText, UriKind.Absolute, out var url) || url is not { Scheme: "http" or "https" }) throw new ArgumentException("Argument 'url' must be a valid HTTP or HTTPS URL."); var timeoutSeconds = ReadOptionalPositiveIntSetting(context.SettingsValues, "timeoutSeconds") ?? DEFAULT_TIMEOUT_SECONDS; var maxContentCharacters = ReadOptionalPositiveIntSetting(context.SettingsValues, "maxContentCharacters") ?? DEFAULT_MAX_CONTENT_CHARACTERS; if (!TryReadAllowedPrivateHostPatterns(context.SettingsValues.GetValueOrDefault(ALLOWED_PRIVATE_HOSTS_SETTING), out var allowedPrivateHosts, out var allowlistError)) throw new InvalidOperationException(allowlistError); HTMLParserWebPage page; try { page = await htmlParser.LoadWebPageAsync( url, token, timeoutSeconds, async (candidateUrl, validationToken) => await this.ValidateUrlAccessAsync(candidateUrl, allowedPrivateHosts, context.ProviderConfidence, validationToken)); } catch (OperationCanceledException) when (!token.IsCancellationRequested) { throw new TimeoutException($"Loading the web page timed out after {timeoutSeconds} seconds."); } catch (HttpRequestException exception) { throw new InvalidOperationException($"Loading the web page failed: {exception.Message}", exception); } if (!IsSupportedHtmlContentType(page.ContentType)) throw new InvalidOperationException($"Unsupported content type '{page.ContentType}'. Only HTML pages are supported."); var document = page.Document; var title = htmlParser.ExtractTitle(document); var contentRoot = document.DocumentNode.SelectSingleNode("//article") ?? document.DocumentNode.SelectSingleNode("//main") ?? document.DocumentNode.SelectSingleNode("//body") ?? document.DocumentNode; RemoveNoiseNodes(contentRoot); var markdown = htmlParser.ParseToMarkdown(contentRoot.InnerHtml).Trim(); var warnings = new JsonArray(); if (string.IsNullOrWhiteSpace(title)) warnings.Add("No title could be extracted from the page."); if (string.IsNullOrWhiteSpace(markdown)) warnings.Add("The extracted page content is empty."); else if (markdown.Length < 200) warnings.Add("The extracted page content is very short and may be incomplete."); if (markdown.Length > maxContentCharacters) { markdown = markdown[..maxContentCharacters].TrimEnd(); warnings.Add($"The extracted page content was truncated to {maxContentCharacters} characters."); } return new ToolExecutionResult { JsonContent = BuildResponseJson(page, title, markdown, warnings) }; } private static JsonObject BuildResponseJson(HTMLParserWebPage page, string title, string markdown, JsonArray warnings) { var response = new JsonObject { ["metadata"] = new JsonObject { ["url"] = page.RequestedUrl.ToString(), ["final_url"] = page.FinalUrl.ToString(), ["title"] = title, }, ["content_markdown"] = markdown, }; if (warnings.Count > 0) response["warnings"] = warnings; return response; } public string FormatTraceResult(string rawResult) { if (rawResult.Length <= MAX_TRACE_LENGTH) return rawResult; return $"{rawResult[..MAX_TRACE_LENGTH]}..."; } private async Task ValidateUrlAccessAsync( Uri url, IReadOnlyList allowedPrivateHosts, ConfidenceLevel providerConfidence, CancellationToken token) { if (url is not { Scheme: "http" or "https" }) throw new ToolExecutionBlockedException("Only HTTP and HTTPS URLs are supported."); if (IsBlockedHostName(url.Host)) throw new ToolExecutionBlockedException("Local web page URLs are not supported."); var addresses = await ResolveHostAddressesAsync(url, token); if (addresses.Count == 0) throw new InvalidOperationException($"The host '{url.Host}' did not resolve to an IP address."); if (addresses.Any(IsNeverAllowedAddress)) throw new ToolExecutionBlockedException("Local, link-local, multicast, and unspecified network addresses are not supported."); if (!addresses.Any(IsNonPublicAddress)) return; if (!IsAllowedPrivateHost(url.Host, allowedPrivateHosts)) throw new ToolExecutionBlockedException("Private or local-network web page URLs are not supported unless their host is explicitly allowed."); if (providerConfidence >= ConfidenceLevel.HIGH) return; await this.ReportPrivateHostProviderBlockAsync(url, providerConfidence); throw new ToolExecutionBlockedException("This private or VPN web page requires a High-confidence provider."); } private async Task ReportPrivateHostProviderBlockAsync(Uri url, ConfidenceLevel providerConfidence) { logger.LogWarning( "Blocked read_web_page access to allowed private host '{Host}' because provider confidence '{ProviderConfidence}' is below HIGH.", url.Host, providerConfidence); await MessageBus.INSTANCE.SendError(new DataErrorMessage( Icons.Material.Filled.Security, TB("The web page was not loaded because private or VPN web pages require a High-confidence provider."))); } private static async Task> ResolveHostAddressesAsync(Uri url, CancellationToken token) { if (IPAddress.TryParse(url.Host, out var parsedAddress)) return [NormalizeAddress(parsedAddress)]; try { return (await Dns.GetHostAddressesAsync(url.DnsSafeHost, token)) .Select(NormalizeAddress) .ToList(); } catch (SocketException exception) { throw new InvalidOperationException($"The host '{url.Host}' could not be resolved: {exception.Message}", exception); } } private static IPAddress NormalizeAddress(IPAddress address) => address.IsIPv4MappedToIPv6 ? address.MapToIPv4() : address; private static bool IsBlockedHostName(string host) { var normalizedHost = NormalizeHost(host); return normalizedHost is "localhost" || normalizedHost.EndsWith(".localhost", StringComparison.Ordinal); } private static bool IsAllowedPrivateHost(string host, IReadOnlyList allowedPrivateHosts) { var normalizedHost = NormalizeHost(host); return allowedPrivateHosts.Any(pattern => pattern.IsMatch(normalizedHost)); } private static string NormalizeHost(string host) => host.Trim().TrimEnd('.').ToLowerInvariant(); private static bool IsNeverAllowedAddress(IPAddress address) { address = NormalizeAddress(address); if (IPAddress.IsLoopback(address)) return true; if (address.AddressFamily is AddressFamily.InterNetwork) { var bytes = address.GetAddressBytes(); return address.Equals(IPAddress.Any) || bytes[0] is 0 or 127 or >= 224 || (bytes[0] == 169 && bytes[1] == 254); } if (address.AddressFamily is AddressFamily.InterNetworkV6) { return address.Equals(IPAddress.IPv6Any) || address.Equals(IPAddress.IPv6None) || address.Equals(IPAddress.IPv6Loopback) || address.IsIPv6LinkLocal || address.IsIPv6Multicast; } return true; } private static bool IsNonPublicAddress(IPAddress address) { address = NormalizeAddress(address); if (IsNeverAllowedAddress(address)) return true; if (address.AddressFamily is AddressFamily.InterNetwork) { var bytes = address.GetAddressBytes(); return bytes[0] == 10 || // Private network: 10.0.0.0/8 (bytes[0] == 100 && bytes[1] is >= 64 and <= 127) || // Carrier-grade NAT: 100.64.0.0/10 (bytes[0] == 172 && bytes[1] is >= 16 and <= 31) || // Private network: 172.16.0.0/12 (bytes[0] == 192 && bytes[1] == 168) || // Private network: 192.168.0.0/16 (bytes[0] == 192 && bytes[1] == 0 && bytes[2] == 0) || // IETF protocol assignments: 192.0.0.0/24 (bytes[0] == 192 && bytes[1] == 0 && bytes[2] == 2) || // Documentation range: 192.0.2.0/24 (bytes[0] == 198 && bytes[1] is 18 or 19) || // Benchmark testing range: 198.18.0.0/15 (bytes[0] == 198 && bytes[1] == 51 && bytes[2] == 100) || // Documentation range: 198.51.100.0/24 (bytes[0] == 203 && bytes[1] == 0 && bytes[2] == 113); // Documentation range: 203.0.113.0/24 } if (address.AddressFamily is AddressFamily.InterNetworkV6) { var bytes = address.GetAddressBytes(); return (bytes[0] & 0xfe) == 0xfc || // Unique local addresses: fc00::/7 address.IsIPv6SiteLocal; // Deprecated site-local addresses: fec0::/10 } return true; } private static bool TryReadAllowedPrivateHostPatterns( string? rawValue, out List patterns, out string error) { patterns = []; error = string.Empty; foreach (var rawPattern in SplitAllowedPrivateHostPatterns(rawValue)) { var pattern = NormalizeHost(rawPattern); if (pattern.Contains("://", StringComparison.Ordinal) || pattern.Contains('/')) { error = TB("Allowed private hosts must be host names only, without scheme or path."); return false; } var isWildcard = pattern.StartsWith("*.", StringComparison.Ordinal); var host = isWildcard ? pattern[2..] : pattern; if (string.IsNullOrWhiteSpace(host) || Uri.CheckHostName(host) is UriHostNameType.Unknown) { error = string.Format(TB("Allowed private host '{0}' is not valid."), rawPattern); return false; } patterns.Add(new AllowedPrivateHostPattern(host, isWildcard)); } patterns = patterns .Distinct() .ToList(); return true; } private static IEnumerable SplitAllowedPrivateHostPatterns(string? rawValue) => rawValue? .Split(['\r', '\n', ',', ';'], StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) .Where(x => !string.IsNullOrWhiteSpace(x)) ?? []; private static void RemoveNoiseNodes(HtmlNode rootNode) { foreach (var xpath in REMOVED_NODE_XPATHS) { var nodes = rootNode.SelectNodes(xpath); if (nodes is null) continue; foreach (var node in nodes.ToList()) node.Remove(); } } private static bool IsSupportedHtmlContentType(string? contentType) => string.IsNullOrWhiteSpace(contentType) || contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase) || contentType.StartsWith("application/xhtml+xml", StringComparison.OrdinalIgnoreCase); private static string ReadRequiredString(JsonElement arguments, string propertyName) { if (!arguments.TryGetProperty(propertyName, out var value) || value.ValueKind is not JsonValueKind.String) throw new ArgumentException($"Missing required argument '{propertyName}'."); var text = value.GetString()?.Trim() ?? string.Empty; if (string.IsNullOrWhiteSpace(text)) throw new ArgumentException($"Missing required argument '{propertyName}'."); return text; } private static int? ReadOptionalPositiveIntSetting(IReadOnlyDictionary settingsValues, string key) { if (!settingsValues.TryGetValue(key, out var value) || string.IsNullOrWhiteSpace(value)) return null; return int.TryParse(value, out var parsedValue) && parsedValue > 0 ? parsedValue : null; } private static bool TryReadOptionalPositiveInt( IReadOnlyDictionary settingsValues, string key, out int? value, out string error) { value = null; error = string.Empty; if (!settingsValues.TryGetValue(key, out var rawValue) || string.IsNullOrWhiteSpace(rawValue)) return true; if (int.TryParse(rawValue, out var parsedValue) && parsedValue > 0) { value = parsedValue; return true; } error = I18N.I.T($"The setting '{key}' must be a positive integer.", typeof(ReadWebPageTool).Namespace, nameof(ReadWebPageTool)); return false; } private readonly record struct AllowedPrivateHostPattern(string Host, bool IsWildcard) { public bool IsMatch(string normalizedHost) => this.IsWildcard ? normalizedHost.EndsWith($".{this.Host}", StringComparison.Ordinal) && normalizedHost.Length > this.Host.Length + 1 : normalizedHost.Equals(this.Host, StringComparison.Ordinal); } }