2026-05-18 13:33:36 +00:00
using System.Net ;
using System.Net.Sockets ;
2026-04-13 11:53:24 +00:00
using System.Text.Json ;
using System.Text.Json.Nodes ;
2026-05-18 13:33:36 +00:00
using AIStudio.Provider ;
2026-04-13 11:53:24 +00:00
using AIStudio.Tools.PluginSystem ;
using HtmlAgilityPack ;
2026-04-13 16:44:36 +00:00
namespace AIStudio.Tools.ToolCallingSystem.ToolCallingImplementations ;
2026-04-13 11:53:24 +00:00
2026-05-18 13:33:36 +00:00
public sealed class ReadWebPageTool ( HTMLParser htmlParser , ILogger < ReadWebPageTool > logger ) : IToolImplementation
2026-04-13 11:53:24 +00:00
{
2026-05-12 15:20:24 +00:00
/// <summary>
/// Looks up the localized text for an English fallback string, using this
/// tool's namespace and type name as the lookup scope.
/// </summary>
/// <param name="fallbackEN">The English text that is handed to the I18N lookup.</param>
/// <returns>The string returned by <c>I18N.I.T</c>.</returns>
private static string TB(string fallbackEN)
{
    var owner = typeof(ReadWebPageTool);
    return I18N.I.T(fallbackEN, owner.Namespace, nameof(ReadWebPageTool));
}
2026-04-13 11:53:24 +00:00
/// <summary>
/// Timeout (in seconds) used by ExecuteAsync when the "timeoutSeconds" setting is missing or not a positive integer.
/// </summary>
private const int DEFAULT_TIMEOUT_SECONDS = 30 ;
/// <summary>
/// Maximum length of the extracted Markdown used by ExecuteAsync when the "maxContentCharacters" setting is missing or not a positive integer.
/// </summary>
private const int DEFAULT_MAX_CONTENT_CHARACTERS = 12000 ;
2026-04-13 14:54:26 +00:00
/// <summary>
/// Maximum number of characters of a raw result that FormatTraceResult keeps before appending "...".
/// </summary>
private const int MAX_TRACE_LENGTH = 12000 ;
2026-05-18 13:33:36 +00:00
/// <summary>
/// Settings key under which the allowlist of private host patterns is stored.
/// </summary>
private const string ALLOWED_PRIVATE_HOSTS_SETTING = "allowedPrivateHosts" ;
2026-04-13 11:53:24 +00:00
/// <summary>
/// XPath expressions for the elements that RemoveNoiseNodes deletes before the
/// content is converted to Markdown: scripts, styles, navigation, footers,
/// sidebars, forms, iframes and the matching ARIA landmark roles.
/// </summary>
/// <remarks>
/// NOTE(review): every expression starts with <c>//</c>, which in XPath is
/// evaluated from the document root rather than from the context node — confirm
/// that this is intended when the expressions are applied to a sub-node.
/// </remarks>
private static readonly string[] REMOVED_NODE_XPATHS =
{
    // Non-content elements.
    "//script",
    "//style",
    "//noscript",

    // Page chrome and embedded widgets.
    "//nav",
    "//footer",
    "//aside",
    "//form",
    "//iframe",

    // ARIA landmark roles.
    "//*[@role='navigation']",
    "//*[@role='contentinfo']",
    "//*[@role='complementary']",
};
/// <summary>
/// Identifier of this tool implementation; always <c>ToolSelectionRules.READ_WEB_PAGE_TOOL_ID</c>.
/// </summary>
public string ImplementationKey
{
    get { return ToolSelectionRules.READ_WEB_PAGE_TOOL_ID; }
}

/// <summary>
/// Icon shown for this tool; always the filled Material "Article" icon.
/// </summary>
public string Icon
{
    get { return Icons.Material.Filled.Article; }
}
// Shared, never-mutated empty set: this tool declares no sensitive arguments,
// so there is no need to allocate a fresh set on every property read.
private static readonly HashSet<string> NO_SENSITIVE_TRACE_ARGUMENT_NAMES = new(StringComparer.Ordinal);

/// <summary>
/// Names of arguments that are treated as sensitive in traces. This tool
/// declares none, so the set is always empty (ordinal string comparison).
/// </summary>
public IReadOnlySet<string> SensitiveTraceArgumentNames => NO_SENSITIVE_TRACE_ARGUMENT_NAMES;
2026-05-12 15:20:24 +00:00
/// <summary>
/// Returns the localized display name of the tool.
/// </summary>
public string GetDisplayName()
{
    return TB("Read Web Page");
}
2026-04-13 11:53:24 +00:00
2026-05-12 15:20:24 +00:00
/// <summary>
/// Returns the localized description of what the tool does and how its result should be used.
/// </summary>
public string GetDescription()
{
    return TB("Load a single web page, extract its main HTML content, and return structured working material for the model. Use the result to synthesize a natural-language answer instead of exposing the raw payload to the user.");
}
2026-04-13 11:53:24 +00:00
/// <summary>
/// Returns the localized label for a settings field.
/// </summary>
/// <param name="fieldName">Key of the settings field.</param>
/// <param name="fieldDefinition">Definition whose <c>Title</c> is localized when the field is not one of the known keys.</param>
/// <returns>The localized label text.</returns>
public string GetSettingsFieldLabel(string fieldName, ToolSettingsFieldDefinition fieldDefinition)
{
    switch (fieldName)
    {
        case "timeoutSeconds":
            return TB("Timeout Seconds");

        case "maxContentCharacters":
            return TB("Maximum Content Characters");

        case ALLOWED_PRIVATE_HOSTS_SETTING:
            return TB("Allowed Private Hosts");

        default:
            // Unknown field: fall back to the title from the field definition.
            return TB(fieldDefinition.Title);
    }
}
/// <summary>
/// Returns the localized help text for a settings field.
/// </summary>
/// <param name="fieldName">Key of the settings field.</param>
/// <param name="fieldDefinition">Definition whose <c>Description</c> is localized when the field is not one of the known keys.</param>
/// <returns>The localized description text.</returns>
public string GetSettingsFieldDescription(string fieldName, ToolSettingsFieldDefinition fieldDefinition)
{
    switch (fieldName)
    {
        case "timeoutSeconds":
            return TB("Optional HTTP timeout for loading a web page in seconds.");

        case "maxContentCharacters":
            return TB("Optional global truncation limit for extracted Markdown returned to the model.");

        case ALLOWED_PRIVATE_HOSTS_SETTING:
            return TB("Optional host allowlist for private or VPN web pages. Separate host patterns with commas, such as example.de, *.example.de. Allowed private hosts require a High-confidence provider.");

        default:
            // Unknown field: fall back to the description from the field definition.
            return TB(fieldDefinition.Description);
    }
}
/// <summary>
/// Validates the tool settings: "timeoutSeconds" and "maxContentCharacters"
/// must be empty or positive integers, and the private-host allowlist must
/// parse. The checks run in that order and the first failure is reported.
/// </summary>
/// <param name="definition">Tool definition; not used by this implementation.</param>
/// <param name="settingsValues">Raw settings values keyed by field name.</param>
/// <param name="token">Cancellation token; not used by this implementation.</param>
/// <returns>
/// A completed task holding <c>null</c> when all settings are valid, otherwise a
/// state with <c>IsConfigured = false</c> and the error message of the first failing check.
/// </returns>
public Task<ToolConfigurationState?> ValidateConfigurationAsync(
    ToolDefinition definition,
    IReadOnlyDictionary<string, string> settingsValues,
    CancellationToken token = default)
{
    // Stays null while every check passes; the else-if chain stops at the first failure.
    string? failureMessage = null;

    if (!TryReadOptionalPositiveInt(settingsValues, "timeoutSeconds", out _, out var timeoutError))
        failureMessage = timeoutError;
    else if (!TryReadOptionalPositiveInt(settingsValues, "maxContentCharacters", out _, out var contentError))
        failureMessage = contentError;
    else if (!TryReadAllowedPrivateHostPatterns(settingsValues.GetValueOrDefault(ALLOWED_PRIVATE_HOSTS_SETTING), out _, out var allowlistError))
        failureMessage = allowlistError;

    ToolConfigurationState? state = failureMessage is null
        ? null
        : new ToolConfigurationState
        {
            IsConfigured = false,
            Message = failureMessage,
        };

    return Task.FromResult(state);
}
/// <summary>
/// Loads the web page named by the "url" argument, extracts its main content as
/// Markdown and returns it together with metadata and warnings as JSON.
/// </summary>
/// <remarks>
/// Steps: validate the URL and settings, load the page through the HTML parser
/// (passing <see cref="ResolveValidatedUrlAddressesAsync"/> as the URL validation
/// callback), pick the first of article / main / body / document as content root,
/// strip noise nodes, convert to Markdown, and truncate to the configured maximum.
/// </remarks>
/// <param name="arguments">JSON arguments; must contain a non-empty string property "url".</param>
/// <param name="context">Execution context providing the settings values and the provider confidence.</param>
/// <param name="token">Cancellation token of the caller.</param>
/// <returns>A result whose <c>JsonContent</c> holds metadata, "content_markdown" and optional "warnings".</returns>
/// <exception cref="ArgumentException">The "url" argument is missing or is not an absolute HTTP/HTTPS URL.</exception>
/// <exception cref="InvalidOperationException">The allowlist setting is invalid, loading failed, or the content type is not HTML.</exception>
/// <exception cref="TimeoutException">Loading was cancelled although the caller's token was not cancelled.</exception>
/// <exception cref="ToolExecutionBlockedException">A blocked-exception was found inside an <see cref="HttpRequestException"/>.</exception>
public async Task<ToolExecutionResult> ExecuteAsync(JsonElement arguments, ToolExecutionContext context, CancellationToken token = default)
{
    var urlText = ReadRequiredString(arguments, "url");
    if (!Uri.TryCreate(urlText, UriKind.Absolute, out var url) || url is not { Scheme: "http" or "https" })
        throw new ArgumentException("Argument 'url' must be a valid HTTP or HTTPS URL.");

    var timeoutSeconds = ReadOptionalPositiveIntSetting(context.SettingsValues, "timeoutSeconds") ?? DEFAULT_TIMEOUT_SECONDS;
    var maxContentCharacters = ReadOptionalPositiveIntSetting(context.SettingsValues, "maxContentCharacters") ?? DEFAULT_MAX_CONTENT_CHARACTERS;
    if (!TryReadAllowedPrivateHostPatterns(context.SettingsValues.GetValueOrDefault(ALLOWED_PRIVATE_HOSTS_SETTING), out var allowedPrivateHosts, out var allowlistError))
        throw new InvalidOperationException(allowlistError);

    HTMLParserWebPage page;
    try
    {
        page = await htmlParser.LoadWebPageAsync(
            url,
            token,
            timeoutSeconds,
            async (candidateUrl, validationToken) => await this.ResolveValidatedUrlAddressesAsync(candidateUrl, allowedPrivateHosts, context.ProviderConfidence, validationToken));
    }
    catch (OperationCanceledException exception) when (!token.IsCancellationRequested)
    {
        // The caller did not cancel, so the cancellation is reported as a timeout.
        // Keep the original exception as inner exception for diagnostics.
        throw new TimeoutException($"Loading the web page timed out after {timeoutSeconds} seconds.", exception);
    }
    catch (HttpRequestException exception)
    {
        // A blocked-exception wrapped anywhere in the chain takes precedence
        // over the generic load failure.
        if (FindBlockedException(exception) is { } blockedException)
            throw blockedException;

        throw new InvalidOperationException($"Loading the web page failed: {exception.Message}", exception);
    }

    if (!IsSupportedHtmlContentType(page.ContentType))
        throw new InvalidOperationException($"Unsupported content type '{page.ContentType}'. Only HTML pages are supported.");

    var document = page.Document;
    var title = htmlParser.ExtractTitle(document);

    // Prefer the most specific content container that exists.
    var contentRoot = document.DocumentNode.SelectSingleNode("//article") ??
                      document.DocumentNode.SelectSingleNode("//main") ??
                      document.DocumentNode.SelectSingleNode("//body") ??
                      document.DocumentNode;

    RemoveNoiseNodes(contentRoot);
    var markdown = htmlParser.ParseToMarkdown(contentRoot.InnerHtml).Trim();

    var warnings = new JsonArray();
    if (string.IsNullOrWhiteSpace(title))
        warnings.Add("No title could be extracted from the page.");

    if (string.IsNullOrWhiteSpace(markdown))
        warnings.Add("The extracted page content is empty.");
    else if (markdown.Length < 200)
        warnings.Add("The extracted page content is very short and may be incomplete.");

    if (markdown.Length > maxContentCharacters)
    {
        markdown = TruncateAtCharacterBoundary(markdown, maxContentCharacters).TrimEnd();
        warnings.Add($"The extracted page content was truncated to {maxContentCharacters} characters.");
    }

    return new ToolExecutionResult
    {
        JsonContent = BuildResponseJson(page, title, markdown, warnings)
    };
}

/// <summary>
/// Cuts <paramref name="text"/> to at most <paramref name="maxLength"/> UTF-16 code units
/// without leaving a lone high surrogate at the end.
/// </summary>
private static string TruncateAtCharacterBoundary(string text, int maxLength)
{
    if (text.Length <= maxLength)
        return text;

    // If the cut would fall between the two halves of a surrogate pair,
    // drop the high surrogate as well.
    var length = maxLength > 0 && char.IsHighSurrogate(text[maxLength - 1])
        ? maxLength - 1
        : maxLength;

    return text[..length];
}
/// <summary>
/// Assembles the JSON payload returned to the model.
/// </summary>
/// <param name="page">Loaded page; supplies the requested and the final URL.</param>
/// <param name="title">Extracted page title.</param>
/// <param name="markdown">Extracted (possibly truncated) Markdown content.</param>
/// <param name="warnings">Collected warnings; only attached when non-empty.</param>
/// <returns>
/// An object with "metadata" (url, final_url, title), "content_markdown" and,
/// when there are any, "warnings".
/// </returns>
private static JsonObject BuildResponseJson(HTMLParserWebPage page, string title, string markdown, JsonArray warnings)
{
    var metadata = new JsonObject
    {
        ["url"] = page.RequestedUrl.ToString(),
        ["final_url"] = page.FinalUrl.ToString(),
        ["title"] = title,
    };

    var payload = new JsonObject();
    payload["metadata"] = metadata;
    payload["content_markdown"] = markdown;

    // Omit the property entirely instead of emitting an empty array.
    if (warnings.Count != 0)
        payload["warnings"] = warnings;

    return payload;
}
/// <summary>
/// Shortens a raw tool result for trace output.
/// </summary>
/// <param name="rawResult">The raw result text.</param>
/// <returns>
/// <paramref name="rawResult"/> unchanged when it has at most MAX_TRACE_LENGTH
/// characters; otherwise its leading part followed by "...".
/// </returns>
public string FormatTraceResult(string rawResult)
{
    if (rawResult.Length <= MAX_TRACE_LENGTH)
        return rawResult;

    // Do not cut between the two halves of a UTF-16 surrogate pair: a lone
    // high surrogate is not a valid character.
    var cutLength = char.IsHighSurrogate(rawResult[MAX_TRACE_LENGTH - 1])
        ? MAX_TRACE_LENGTH - 1
        : MAX_TRACE_LENGTH;

    return $"{rawResult[..cutLength]}...";
}
2026-05-18 15:00:05 +00:00
/// <summary>
/// Searches an exception and everything it wraps for a
/// <see cref="ToolExecutionBlockedException"/>.
/// </summary>
/// <param name="exception">The exception to inspect.</param>
/// <returns>
/// The first blocked-exception found (the exception itself, then the members of
/// an <see cref="AggregateException"/>, then the inner-exception chain), or
/// <c>null</c> when there is none.
/// </returns>
private static ToolExecutionBlockedException? FindBlockedException(Exception exception)
{
    switch (exception)
    {
        case ToolExecutionBlockedException direct:
            return direct;

        case AggregateException aggregate:
            // Lazy query: stops at the first member that contains a match.
            var nested = aggregate.InnerExceptions
                .Select(FindBlockedException)
                .FirstOrDefault(found => found is not null);
            if (nested is not null)
                return nested;
            break;
    }

    var inner = exception.InnerException;
    if (inner is null)
        return null;

    return FindBlockedException(inner);
}
/// <summary>
/// Resolves the host of <paramref name="url"/> and decides whether the tool may
/// connect to the resulting addresses.
/// </summary>
/// <remarks>
/// Order of checks: scheme, blocked host names, DNS resolution, addresses that
/// are never allowed, public-only addresses (accepted), private-host allowlist,
/// provider confidence.
/// </remarks>
/// <param name="url">The URL about to be loaded.</param>
/// <param name="allowedPrivateHosts">Host patterns that may resolve to non-public addresses.</param>
/// <param name="providerConfidence">Confidence of the current provider; non-public hosts require at least HIGH.</param>
/// <param name="token">Cancellation token for the DNS lookup.</param>
/// <returns>The resolved addresses when access is permitted.</returns>
/// <exception cref="ToolExecutionBlockedException">The URL or one of its addresses is not permitted.</exception>
/// <exception cref="InvalidOperationException">The host did not resolve to any address.</exception>
private async Task<IReadOnlyList<IPAddress>> ResolveValidatedUrlAddressesAsync(
    Uri url,
    IReadOnlyList<AllowedPrivateHostPattern> allowedPrivateHosts,
    ConfidenceLevel providerConfidence,
    CancellationToken token)
{
    if (url is not { Scheme: "http" or "https" })
        throw new ToolExecutionBlockedException("Only HTTP and HTTPS URLs are supported.");

    if (IsBlockedHostName(url.Host))
        throw new ToolExecutionBlockedException("Local web page URLs are not supported.");

    var resolved = await ResolveHostAddressesAsync(url, token);
    if (resolved.Count == 0)
        throw new InvalidOperationException($"The host '{url.Host}' did not resolve to an IP address.");

    // A single forbidden address rejects the whole host.
    if (resolved.Any(IsNeverAllowedAddress))
        throw new ToolExecutionBlockedException("Local, link-local, multicast, and unspecified network addresses are not supported.");

    // Purely public hosts need no further approval.
    var touchesPrivateNetwork = resolved.Any(IsNonPublicAddress);
    if (!touchesPrivateNetwork)
        return resolved;

    if (!IsAllowedPrivateHost(url.Host, allowedPrivateHosts))
        throw new ToolExecutionBlockedException("Private or local-network web page URLs are not supported unless their host is explicitly allowed.");

    if (providerConfidence < ConfidenceLevel.HIGH)
    {
        await this.ReportPrivateHostProviderBlockAsync(url, providerConfidence);
        throw new ToolExecutionBlockedException("This private or VPN web page requires a High-confidence provider.");
    }

    return resolved;
}
/// <summary>
/// Logs a warning and sends an error message over the message bus when an
/// allowlisted private host is blocked because the provider confidence is below HIGH.
/// </summary>
/// <param name="url">The blocked URL; only its host is logged.</param>
/// <param name="providerConfidence">The provider confidence that caused the block.</param>
private async Task ReportPrivateHostProviderBlockAsync(Uri url, ConfidenceLevel providerConfidence)
{
    var blockedHost = url.Host;
    logger.LogWarning(
        "Blocked read_web_page access to allowed private host '{Host}' because provider confidence '{ProviderConfidence}' is below HIGH.",
        blockedHost,
        providerConfidence);

    var userMessage = TB("The web page was not loaded because private or VPN web pages require a High-confidence provider.");
    var errorMessage = new DataErrorMessage(Icons.Material.Filled.Security, userMessage);
    await MessageBus.INSTANCE.SendError(errorMessage);
}
/// <summary>
/// Determines the IP addresses behind the host of <paramref name="url"/>.
/// </summary>
/// <remarks>
/// A host that already is an IP literal is parsed directly without a DNS
/// lookup. All returned addresses are passed through <see cref="NormalizeAddress"/>.
/// </remarks>
/// <param name="url">The URL whose host is resolved.</param>
/// <param name="token">Cancellation token for the DNS lookup.</param>
/// <returns>The normalized addresses of the host.</returns>
/// <exception cref="InvalidOperationException">The DNS lookup failed with a <see cref="SocketException"/>.</exception>
private static async Task<IReadOnlyList<IPAddress>> ResolveHostAddressesAsync(Uri url, CancellationToken token)
{
    if (IPAddress.TryParse(url.Host, out var literalAddress))
        return new[] { NormalizeAddress(literalAddress) };

    IPAddress[] lookupResult;
    try
    {
        lookupResult = await Dns.GetHostAddressesAsync(url.DnsSafeHost, token);
    }
    catch (SocketException exception)
    {
        throw new InvalidOperationException($"The host '{url.Host}' could not be resolved: {exception.Message}", exception);
    }

    return Array.ConvertAll(lookupResult, NormalizeAddress);
}
/// <summary>
/// Converts an IPv4-mapped IPv6 address to its IPv4 form so that the IPv4
/// range checks apply to it; any other address is returned unchanged.
/// </summary>
/// <param name="address">The address to normalize.</param>
/// <returns>The IPv4 address for IPv4-mapped input, otherwise <paramref name="address"/>.</returns>
private static IPAddress NormalizeAddress(IPAddress address)
{
    if (address.IsIPv4MappedToIPv6)
        return address.MapToIPv4();

    return address;
}
/// <summary>
/// Tells whether a host name refers to the local machine by name, i.e. it is
/// "localhost" or ends with ".localhost" after normalization.
/// </summary>
/// <param name="host">The host name to check.</param>
/// <returns><c>true</c> when the host name is blocked.</returns>
private static bool IsBlockedHostName(string host)
{
    var candidate = NormalizeHost(host);
    if (candidate == "localhost")
        return true;

    return candidate.EndsWith(".localhost", StringComparison.Ordinal);
}
/// <summary>
/// Tells whether a host matches at least one pattern of the private-host allowlist.
/// </summary>
/// <param name="host">The host to check; it is normalized before matching.</param>
/// <param name="allowedPrivateHosts">The allowlist patterns.</param>
/// <returns><c>true</c> when any pattern matches; <c>false</c> for an empty allowlist.</returns>
private static bool IsAllowedPrivateHost(string host, IReadOnlyList<AllowedPrivateHostPattern> allowedPrivateHosts)
{
    var candidate = NormalizeHost(host);
    foreach (var allowedPattern in allowedPrivateHosts)
    {
        if (allowedPattern.IsMatch(candidate))
            return true;
    }

    return false;
}
/// <summary>
/// Brings a host name or host pattern into a canonical form for comparison:
/// surrounding whitespace and trailing dots are removed, the square brackets
/// around an IPv6 literal are stripped, and the result is lower-cased.
/// </summary>
/// <remarks>
/// <c>Uri.Host</c> reports IPv6 literals with brackets ("[fd00::1]"), while an
/// allowlist entry may be written without them ("fd00::1"). Stripping the
/// brackets makes both spellings normalize to the same value.
/// </remarks>
/// <param name="host">The raw host name or pattern.</param>
/// <returns>The normalized host.</returns>
private static string NormalizeHost(string host)
{
    var normalized = host.Trim().TrimEnd('.');

    // "[...]" is only ever an IPv6 literal; compare it without the brackets.
    if (normalized.Length >= 2 && normalized[0] == '[' && normalized[^1] == ']')
        normalized = normalized[1..^1];

    return normalized.ToLowerInvariant();
}
/// <summary>
/// Tells whether an address must be rejected regardless of any allowlist.
/// </summary>
/// <remarks>
/// Rejected are loopback addresses; IPv4 addresses whose first octet is 0, 127
/// or at least 224, or that lie in 169.254.0.0/16; the IPv6 any, none and
/// loopback addresses; IPv6 link-local and multicast addresses; and every
/// address of another address family.
/// </remarks>
/// <param name="address">The address to check; it is normalized first.</param>
/// <returns><c>true</c> when the address is never allowed.</returns>
private static bool IsNeverAllowedAddress(IPAddress address)
{
    var candidate = NormalizeAddress(address);
    if (IPAddress.IsLoopback(candidate))
        return true;

    switch (candidate.AddressFamily)
    {
        case AddressFamily.InterNetwork:
        {
            var octets = candidate.GetAddressBytes();
            if (candidate.Equals(IPAddress.Any))
                return true;

            if (octets[0] is 0 or 127 or >= 224)
                return true;

            return octets[0] == 169 && octets[1] == 254;
        }

        case AddressFamily.InterNetworkV6:
        {
            if (candidate.Equals(IPAddress.IPv6Any) ||
                candidate.Equals(IPAddress.IPv6None) ||
                candidate.Equals(IPAddress.IPv6Loopback))
                return true;

            return candidate.IsIPv6LinkLocal || candidate.IsIPv6Multicast;
        }

        default:
            // Unknown address families are rejected.
            return true;
    }
}
/// <summary>
/// Tells whether an address is outside the public address space, i.e. it is
/// either never allowed at all or belongs to a private, shared or reserved range.
/// </summary>
/// <param name="address">The address to check; it is normalized first.</param>
/// <returns>
/// <c>true</c> for never-allowed addresses, the listed IPv4 ranges, IPv6 unique
/// local and site-local addresses, and any other address family.
/// </returns>
private static bool IsNonPublicAddress(IPAddress address)
{
    var candidate = NormalizeAddress(address);
    if (IsNeverAllowedAddress(candidate))
        return true;

    switch (candidate.AddressFamily)
    {
        case AddressFamily.InterNetwork:
        {
            var octets = candidate.GetAddressBytes();
            return (octets[0], octets[1], octets[2]) switch
            {
                (10, _, _) => true,                    // Private network: 10.0.0.0/8
                (100, >= 64 and <= 127, _) => true,    // Carrier-grade NAT: 100.64.0.0/10
                (172, >= 16 and <= 31, _) => true,     // Private network: 172.16.0.0/12
                (192, 168, _) => true,                 // Private network: 192.168.0.0/16
                (192, 0, 0) => true,                   // IETF protocol assignments: 192.0.0.0/24
                (192, 0, 2) => true,                   // Documentation range: 192.0.2.0/24
                (198, 18 or 19, _) => true,            // Benchmark testing range: 198.18.0.0/15
                (198, 51, 100) => true,                // Documentation range: 198.51.100.0/24
                (203, 0, 113) => true,                 // Documentation range: 203.0.113.0/24
                _ => false,
            };
        }

        case AddressFamily.InterNetworkV6:
        {
            var firstByte = candidate.GetAddressBytes()[0];
            var isUniqueLocal = (firstByte & 0xfe) == 0xfc;   // Unique local addresses: fc00::/7
            return isUniqueLocal || candidate.IsIPv6SiteLocal; // Deprecated site-local addresses: fec0::/10
        }

        default:
            // Unknown address families are treated as non-public.
            return true;
    }
}
/// <summary>
/// Parses the raw allowlist setting into host patterns.
/// </summary>
/// <remarks>
/// Each entry is normalized, must not contain a scheme or path, may start with
/// "*." to form a wildcard, and must be accepted by <c>Uri.CheckHostName</c>.
/// Duplicate patterns are removed from the result.
/// </remarks>
/// <param name="rawValue">The raw setting value; <c>null</c> yields an empty list.</param>
/// <param name="patterns">The parsed patterns; only meaningful when the method returns <c>true</c>.</param>
/// <param name="error">The localized error message of the first invalid entry, otherwise empty.</param>
/// <returns><c>true</c> when every entry is valid.</returns>
private static bool TryReadAllowedPrivateHostPatterns(
    string? rawValue,
    out List<AllowedPrivateHostPattern> patterns,
    out string error)
{
    error = string.Empty;
    patterns = new List<AllowedPrivateHostPattern>();

    foreach (var entry in SplitAllowedPrivateHostPatterns(rawValue))
    {
        var normalized = NormalizeHost(entry);

        // Any slash means a scheme ("://") or a path was entered.
        if (normalized.Contains('/'))
        {
            error = TB("Allowed private hosts must be host names only, without scheme or path.");
            return false;
        }

        const string wildcardPrefix = "*.";
        var hasWildcard = normalized.StartsWith(wildcardPrefix, StringComparison.Ordinal);
        var hostPart = hasWildcard ? normalized[wildcardPrefix.Length..] : normalized;

        var isValidHost = !string.IsNullOrWhiteSpace(hostPart) &&
                          Uri.CheckHostName(hostPart) != UriHostNameType.Unknown;
        if (!isValidHost)
        {
            error = string.Format(TB("Allowed private host '{0}' is not valid."), entry);
            return false;
        }

        patterns.Add(new AllowedPrivateHostPattern(hostPart, hasWildcard));
    }

    patterns = patterns.Distinct().ToList();
    return true;
}
/// <summary>
/// Splits the raw allowlist setting into its individual entries.
/// </summary>
/// <remarks>
/// Entries may be separated by commas, semicolons or line breaks; they are
/// trimmed and empty entries are dropped.
/// </remarks>
/// <param name="rawValue">The raw setting value, possibly <c>null</c>.</param>
/// <returns>The non-empty, trimmed entries; empty when <paramref name="rawValue"/> is <c>null</c>.</returns>
private static IEnumerable<string> SplitAllowedPrivateHostPatterns(string? rawValue)
{
    if (rawValue is null)
        return [];

    char[] separators = ['\r', '\n', ',', ';'];
    const StringSplitOptions splitOptions = StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries;

    return rawValue
        .Split(separators, splitOptions)
        .Where(entry => !string.IsNullOrWhiteSpace(entry));
}
2026-04-13 11:53:24 +00:00
/// <summary>
/// Removes every node selected by the expressions in REMOVED_NODE_XPATHS.
/// </summary>
/// <param name="rootNode">The node on which the XPath expressions are evaluated.</param>
private static void RemoveNoiseNodes(HtmlNode rootNode)
{
    foreach (var expression in REMOVED_NODE_XPATHS)
    {
        // SelectNodes returns null when nothing matches.
        if (rootNode.SelectNodes(expression) is not { } matches)
            continue;

        // Iterate over a snapshot because removing nodes changes the document.
        foreach (var match in matches.ToArray())
            match.Remove();
    }
}
/// <summary>
/// Tells whether a response content type can be processed as HTML.
/// </summary>
/// <param name="contentType">The content type of the response, possibly missing.</param>
/// <returns>
/// <c>true</c> when the content type is missing or blank, or starts with
/// "text/html" or "application/xhtml+xml" (case-insensitive).
/// </returns>
private static bool IsSupportedHtmlContentType(string? contentType)
{
    // A missing content type is accepted.
    if (string.IsNullOrWhiteSpace(contentType))
        return true;

    string[] acceptedPrefixes = ["text/html", "application/xhtml+xml"];
    foreach (var prefix in acceptedPrefixes)
    {
        if (contentType.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
            return true;
    }

    return false;
}
/// <summary>
/// Reads a required, non-empty string property from the tool arguments.
/// </summary>
/// <param name="arguments">The JSON arguments object.</param>
/// <param name="propertyName">Name of the required property.</param>
/// <returns>The trimmed property value.</returns>
/// <exception cref="ArgumentException">
/// The property is missing, is not a JSON string, or is empty after trimming.
/// </exception>
private static string ReadRequiredString(JsonElement arguments, string propertyName)
{
    var isString = arguments.TryGetProperty(propertyName, out var property) &&
                   property.ValueKind == JsonValueKind.String;

    // All failure modes produce the same message, so they share one check.
    var trimmed = isString ? property.GetString()?.Trim() : null;
    if (string.IsNullOrWhiteSpace(trimmed))
        throw new ArgumentException($"Missing required argument '{propertyName}'.");

    return trimmed;
}
/// <summary>
/// Reads an optional setting as a positive integer.
/// </summary>
/// <param name="settingsValues">Raw settings values keyed by field name.</param>
/// <param name="key">Key of the setting to read.</param>
/// <returns>
/// The parsed value when the setting exists and is an integer greater than zero;
/// <c>null</c> when it is missing, blank, not an integer, or not positive.
/// </returns>
private static int? ReadOptionalPositiveIntSetting(IReadOnlyDictionary<string, string> settingsValues, string key)
{
    var hasText = settingsValues.TryGetValue(key, out var rawText) && !string.IsNullOrWhiteSpace(rawText);
    if (!hasText)
        return null;

    if (!int.TryParse(rawText, out var number))
        return null;

    return number > 0 ? number : null;
}
/// <summary>
/// Validates and reads an optional setting that must be a positive integer.
/// </summary>
/// <param name="settingsValues">Raw settings values keyed by field name.</param>
/// <param name="key">Key of the setting to read.</param>
/// <param name="value">The parsed value, or <c>null</c> when the setting is missing, blank or invalid.</param>
/// <param name="error">A localized error message when the setting is invalid, otherwise empty.</param>
/// <returns>
/// <c>true</c> when the setting is missing, blank, or a positive integer;
/// <c>false</c> when it has a value that is not a positive integer.
/// </returns>
private static bool TryReadOptionalPositiveInt(
    IReadOnlyDictionary<string, string> settingsValues,
    string key,
    out int? value,
    out string error)
{
    value = null;
    error = string.Empty;

    // A missing or blank setting is valid: the caller falls back to its default.
    if (!settingsValues.TryGetValue(key, out var rawValue) || string.IsNullOrWhiteSpace(rawValue))
        return true;

    if (int.TryParse(rawValue, out var parsedValue) && parsedValue > 0)
    {
        value = parsedValue;
        return true;
    }

    // Use a constant translation key with a placeholder, like the other
    // messages in this class, instead of a key that varies per setting name.
    error = string.Format(TB("The setting '{0}' must be a positive integer."), key);
    return false;
}
2026-05-18 13:33:36 +00:00
/// <summary>
/// One entry of the private-host allowlist.
/// </summary>
/// <param name="Host">The host name the pattern refers to, without a leading "*.".</param>
/// <param name="IsWildcard">
/// When <c>true</c>, the pattern matches subdomains of <paramref name="Host"/>
/// but not the host itself; when <c>false</c>, only the exact host matches.
/// </param>
private readonly record struct AllowedPrivateHostPattern(string Host, bool IsWildcard)
{
    /// <summary>
    /// Tells whether an already normalized host matches this pattern (ordinal comparison).
    /// </summary>
    /// <param name="normalizedHost">The normalized host to test.</param>
    /// <returns><c>true</c> when the host matches.</returns>
    public bool IsMatch(string normalizedHost)
    {
        if (!this.IsWildcard)
            return normalizedHost.Equals(this.Host, StringComparison.Ordinal);

        // A wildcard needs at least one character in front of ".<Host>".
        var requiredSuffix = $".{this.Host}";
        return normalizedHost.EndsWith(requiredSuffix, StringComparison.Ordinal) &&
               normalizedHost.Length > requiredSuffix.Length;
    }
}
2026-04-13 11:53:24 +00:00
}