using System.Net;
using System.Net.Http;
using System.Text;

using HtmlAgilityPack;

using ReverseMarkdown;

namespace AIStudio.Tools;

/// <summary>
/// Downloads web pages, parses them with HtmlAgilityPack, and converts HTML to Markdown.
/// </summary>
public sealed class HTMLParser
{
    // Shared converter configuration: unknown tags are bypassed, HTML comments are
    // removed, and smart href handling is enabled.
    private static readonly Config MARKDOWN_PARSER_CONFIG = new()
    {
        UnknownTags = Config.UnknownTagsOption.Bypass,
        RemoveComments = true,
        SmartHrefHandling = true
    };

    /// <summary>
    /// Loads the web content from the specified URL.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as text.</returns>
    public async Task<string> LoadWebContentText(Uri url)
    {
        var response = await this.LoadWebPageAsync(url);
        return response.Document.ParsedText;
    }

    /// <summary>
    /// Loads the web content from the specified URL and returns it as an HTML string.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as an HTML string.</returns>
    public async Task<string> LoadWebContentHTML(Uri url)
    {
        var response = await this.LoadWebPageAsync(url);
        var innerHtml = response.Document.DocumentNode.InnerHtml;
        return innerHtml;
    }

    /// <summary>
    /// Downloads the page at the specified URL and parses it into an <see cref="HtmlDocument"/>.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <param name="token">A token the caller can use to cancel the operation.</param>
    /// <param name="timeoutSeconds">The number of seconds after which the operation is canceled.</param>
    /// <returns>The parsed document together with the requested URL, the final URL, and the content type.</returns>
    /// <exception cref="HttpRequestException">The response did not have a success status code.</exception>
    /// <exception cref="OperationCanceledException">The caller canceled the operation or the timeout elapsed.</exception>
    public async Task<HTMLParserWebPage> LoadWebPageAsync(Uri url, CancellationToken token = default, int timeoutSeconds = 30)
    {
        // The client's own timeout is disabled; the deadline is enforced through the
        // linked cancellation source below, which also observes the caller's token.
        using var httpClient = new HttpClient
        {
            Timeout = Timeout.InfiniteTimeSpan,
        };

        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(token);
        timeoutCts.CancelAfter(TimeSpan.FromSeconds(timeoutSeconds));

        using var response = await httpClient.GetAsync(url, timeoutCts.Token);
        response.EnsureSuccessStatusCode();

        // Use the linked token here as well, so that both the timeout and the caller's
        // cancellation apply to reading the body, not only to sending the request.
        var html = await response.Content.ReadAsStringAsync(timeoutCts.Token);

        var document = new HtmlDocument();
        document.LoadHtml(html);

        return new HTMLParserWebPage
        {
            RequestedUrl = url,

            // Fall back to the requested URL when the response carries no request message or URI.
            FinalUrl = response.RequestMessage?.RequestUri ?? url,
            ContentType = response.Content.Headers.ContentType?.MediaType ?? string.Empty,
            Document = document,
        };
    }

    /// <summary>
    /// Extracts the text of the first <c>title</c> element of the document.
    /// </summary>
    /// <param name="document">The parsed HTML document.</param>
    /// <returns>The HTML-decoded, trimmed title; an empty string when no title element is found.</returns>
    public string ExtractTitle(HtmlDocument document)
    {
        var title = document.DocumentNode.SelectSingleNode("//title")?.InnerText?.Trim();

        // Trim again after decoding: decoded entities may leave whitespace at the edges.
        return WebUtility.HtmlDecode(title ?? string.Empty).Trim();
    }

    /// <summary>
    /// Converts HTML content to the Markdown format.
    /// </summary>
    /// <param name="html">The HTML content to parse.</param>
    /// <returns>The converted Markdown content.</returns>
    public string ParseToMarkdown(string html)
    {
        var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG);
        return markdownConverter.Convert(html);
    }
}

/// <summary>
/// The result of loading a web page with <see cref="HTMLParser.LoadWebPageAsync"/>.
/// </summary>
public sealed class HTMLParserWebPage
{
    /// <summary>
    /// The URL that was passed to the loader.
    /// </summary>
    public required Uri RequestedUrl { get; init; }

    /// <summary>
    /// The request URI reported by the response, or the requested URL when none is available.
    /// </summary>
    public required Uri FinalUrl { get; init; }

    /// <summary>
    /// The media type from the response's Content-Type header, or an empty string when absent.
    /// </summary>
    public required string ContentType { get; init; }

    /// <summary>
    /// The parsed HTML document.
    /// </summary>
    public required HtmlDocument Document { get; init; }
}