using System.Net; using System.Net.Http.Headers; using System.Net.Sockets; using System.Text; using HtmlAgilityPack; using ReverseMarkdown; namespace AIStudio.Tools; public sealed class HTMLParser { private const string USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) MindWorkAIStudio/1.0"; private const int MAX_REDIRECTS = 10; private const int DEFAULT_MAX_RESPONSE_BYTES = 5 * 1024 * 1024; private static readonly Config MARKDOWN_PARSER_CONFIG = new() { UnknownTags = Config.UnknownTagsOption.Bypass, RemoveComments = true, SmartHrefHandling = true }; /// /// Loads the web content from the specified URL. /// /// The URL of the web page. /// The web content as text. public async Task LoadWebContentText(Uri url) { var response = await this.LoadWebPageAsync(url); return response.Document.ParsedText; } /// /// Loads the web content from the specified URL and returns it as an HTML string. /// /// The URL of the web page. /// The web content as an HTML string. public async Task LoadWebContentHTML(Uri url) { var response = await this.LoadWebPageAsync(url); var innerHtml = response.Document.DocumentNode.InnerHtml; return innerHtml; } public async Task LoadWebPageAsync( Uri url, CancellationToken token = default, int timeoutSeconds = 30, Func>>? resolveUrlAddressesAsync = null, int maxResponseBytes = DEFAULT_MAX_RESPONSE_BYTES, ExternalWebAuthenticationMode authenticationMode = ExternalWebAuthenticationMode.NONE) { using var handler = new SocketsHttpHandler { AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate | DecompressionMethods.Brotli, AllowAutoRedirect = false, }; if (authenticationMode is ExternalWebAuthenticationMode.OS_DEFAULT_CREDENTIALS) handler.Credentials = CreateDefaultCredentialCache(url); if (resolveUrlAddressesAsync is not null) { // The callback binds the request to a vetted target IP; a proxy would change the endpoint being connected to. handler.UseProxy = false; handler.ConnectCallback = async (context, connectionToken) => await ConnectToResolvedAddressAsync(context, resolveUrlAddressesAsync, connectionToken); } using var httpClient = new HttpClient(handler) { Timeout = Timeout.InfiniteTimeSpan, }; using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(token); timeoutCts.CancelAfter(TimeSpan.FromSeconds(timeoutSeconds)); var currentUrl = url; for (var redirectCount = 0; redirectCount <= MAX_REDIRECTS; redirectCount++) { ValidateHttpOrHttpsUrl(currentUrl); using var request = CreateRequest(currentUrl); using var response = await httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, timeoutCts.Token); if (IsRedirect(response.StatusCode)) { if (response.Headers.Location is null) throw new HttpRequestException($"The server returned a redirect without a Location header for '{currentUrl}'.", null, response.StatusCode); currentUrl = response.Headers.Location.IsAbsoluteUri ? response.Headers.Location : new Uri(currentUrl, response.Headers.Location); continue; } if (!response.IsSuccessStatusCode) { var statusCode = (int)response.StatusCode; var reasonPhrase = string.IsNullOrWhiteSpace(response.ReasonPhrase) ? "Unknown" : response.ReasonPhrase; throw new HttpRequestException($"The server returned HTTP {statusCode} ({reasonPhrase}) for '{currentUrl}'.", null, response.StatusCode); } var html = await ReadContentAsStringWithLimitAsync(response.Content, maxResponseBytes, timeoutCts.Token); var document = new HtmlDocument(); document.LoadHtml(html); return new HTMLParserWebPage { RequestedUrl = url, FinalUrl = response.RequestMessage?.RequestUri ?? currentUrl, ContentType = response.Content.Headers.ContentType?.MediaType ?? string.Empty, Document = document, }; } throw new HttpRequestException($"The server returned more than {MAX_REDIRECTS} redirects for '{url}'."); } private static CredentialCache CreateDefaultCredentialCache(Uri url) { var credentialCache = new CredentialCache(); var uriPrefix = new UriBuilder(url.Scheme, url.Host, url.Port).Uri; credentialCache.Add(uriPrefix, "Negotiate", CredentialCache.DefaultNetworkCredentials); credentialCache.Add(uriPrefix, "NTLM", CredentialCache.DefaultNetworkCredentials); credentialCache.Add(uriPrefix, "Kerberos", CredentialCache.DefaultNetworkCredentials); return credentialCache; } private static void ValidateHttpOrHttpsUrl(Uri url) { if (url.Scheme.Equals(Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) || url.Scheme.Equals(Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase)) return; throw new HttpRequestException($"Unsupported URL scheme '{url.Scheme}' for '{url}'."); } private static async ValueTask ConnectToResolvedAddressAsync( SocketsHttpConnectionContext context, Func>> resolveUrlAddressesAsync, CancellationToken token) { var requestUri = context.InitialRequestMessage.RequestUri ?? throw new HttpRequestException("The HTTP request did not contain a target URL."); var addresses = await resolveUrlAddressesAsync(requestUri, token); if (addresses.Count == 0) throw new HttpRequestException($"The host '{requestUri.Host}' did not resolve to an IP address."); List connectionErrors = []; foreach (var address in addresses.Distinct()) { var socket = new Socket(address.AddressFamily, SocketType.Stream, ProtocolType.Tcp) { NoDelay = true, }; try { await socket.ConnectAsync(new IPEndPoint(address, context.DnsEndPoint.Port), token); return new NetworkStream(socket, ownsSocket: true); } catch (SocketException exception) { connectionErrors.Add(exception); socket.Dispose(); } catch { socket.Dispose(); throw; } } Exception innerException = connectionErrors.Count == 1 ? connectionErrors[0] : new AggregateException(connectionErrors); throw new HttpRequestException($"Could not connect to a validated address for '{requestUri.Host}'.", innerException); } private static HttpRequestMessage CreateRequest(Uri url) { var request = new HttpRequestMessage(HttpMethod.Get, url); request.Headers.TryAddWithoutValidation("User-Agent", USER_AGENT); request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/html")); request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("application/xhtml+xml")); request.Headers.AcceptLanguage.Add(new StringWithQualityHeaderValue("en-US")); request.Headers.AcceptLanguage.Add(new StringWithQualityHeaderValue("en", 0.9)); request.Headers.AcceptEncoding.Add(new StringWithQualityHeaderValue("gzip")); request.Headers.AcceptEncoding.Add(new StringWithQualityHeaderValue("deflate")); request.Headers.AcceptEncoding.Add(new StringWithQualityHeaderValue("br")); request.Headers.TryAddWithoutValidation("Upgrade-Insecure-Requests", "1"); request.Headers.TryAddWithoutValidation("Sec-Fetch-Site", "none"); request.Headers.TryAddWithoutValidation("Sec-Fetch-Mode", "navigate"); request.Headers.TryAddWithoutValidation("Sec-Fetch-Dest", "document"); request.Headers.TryAddWithoutValidation("Sec-Fetch-User", "?1"); return request; } private static bool IsRedirect(HttpStatusCode statusCode) => (int)statusCode is >= 300 and <= 399; private static async Task ReadContentAsStringWithLimitAsync(HttpContent content, int maxResponseBytes, CancellationToken token) { if (content.Headers.ContentLength is long contentLength && contentLength > maxResponseBytes) throw new HttpRequestException($"The response body is too large. Maximum allowed size is {maxResponseBytes} bytes."); await using var stream = await content.ReadAsStreamAsync(token); await using var buffer = new MemoryStream(); var chunk = new byte[8192]; while (true) { var read = await stream.ReadAsync(chunk, token); if (read == 0) break; if (buffer.Length + read > maxResponseBytes) throw new HttpRequestException($"The response body is too large. Maximum allowed size is {maxResponseBytes} bytes."); buffer.Write(chunk, 0, read); } var encoding = TryGetContentEncoding(content) ?? Encoding.UTF8; return encoding.GetString(buffer.ToArray()); } private static Encoding? TryGetContentEncoding(HttpContent content) { var charset = content.Headers.ContentType?.CharSet?.Trim(); if (string.IsNullOrWhiteSpace(charset)) return null; try { return Encoding.GetEncoding(charset.Trim('"')); } catch { return null; } } public string ExtractTitle(HtmlDocument document) { var title = document.DocumentNode.SelectSingleNode("//title")?.InnerText?.Trim(); return WebUtility.HtmlDecode(title ?? string.Empty).Trim(); } /// /// Converts HTML content to the Markdown format. /// /// The HTML content to parse. /// The converted Markdown content. public string ParseToMarkdown(string html) { var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG); return markdownConverter.Convert(html); } } public sealed class HTMLParserWebPage { public required Uri RequestedUrl { get; init; } public required Uri FinalUrl { get; init; } public required string ContentType { get; init; } public required HtmlDocument Document { get; init; } } public enum ExternalWebAuthenticationMode { NONE, OS_DEFAULT_CREDENTIALS }