using System.Net;
using System.Text;

using HtmlAgilityPack;

using ReverseMarkdown;

namespace AIStudio.Tools;

/// <summary>
/// Downloads web pages and converts HTML content to Markdown.
/// </summary>
public sealed class HTMLParser
{
    /// <summary>
    /// Shared ReverseMarkdown configuration used by <see cref="ParseToMarkdown"/>.
    /// </summary>
    private static readonly Config MARKDOWN_PARSER_CONFIG = new()
    {
        UnknownTags = Config.UnknownTagsOption.Bypass,
        RemoveComments = true,
        SmartHrefHandling = true
    };

    /// <summary>
    /// Delay after which the cancellation token handed to a web request is cancelled.
    /// </summary>
    private static readonly TimeSpan WEB_REQUEST_TIMEOUT = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Loads the web content from the specified URL.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as text.</returns>
    public async Task<string> LoadWebContentText(Uri url)
    {
        var doc = await LoadDocument(url);
        return doc.ParsedText;
    }

    /// <summary>
    /// Loads the web content from the specified URL and returns it as an HTML string.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as an HTML string (the inner HTML of the document node).</returns>
    public async Task<string> LoadWebContentHTML(Uri url)
    {
        var doc = await LoadDocument(url);
        return doc.DocumentNode.InnerHtml;
    }

    /// <summary>
    /// Converts HTML content to the Markdown format.
    /// </summary>
    /// <param name="html">The HTML content to parse.</param>
    /// <returns>The converted Markdown content.</returns>
    public string ParseToMarkdown(string html)
    {
        var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG);
        return markdownConverter.Convert(html);
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the loaded document.
    /// A cancellation token that is cancelled after <see cref="WEB_REQUEST_TIMEOUT"/>
    /// is passed to the request.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The document loaded by HtmlAgilityPack.</returns>
    private static async Task<HtmlDocument> LoadDocument(Uri url)
    {
        // Dispose the token source so its timeout timer is released once the request is done.
        using var cts = new CancellationTokenSource(WEB_REQUEST_TIMEOUT);
        var parser = new HtmlWeb();

        // Await here (rather than returning the task) so the token source is not
        // disposed while the request is still in flight. The credential object is
        // created empty, i.e. without a user name or password.
        return await parser.LoadFromWebAsync(url, Encoding.UTF8, new NetworkCredential(), cts.Token);
    }
}