using System.Net;
using System.Text;
using HtmlAgilityPack;
using ReverseMarkdown;
namespace AIStudio.Tools;
/// <summary>
/// Downloads web pages via HtmlAgilityPack and converts HTML to Markdown
/// via ReverseMarkdown.
/// </summary>
public sealed class HTMLParser
{
    /// <summary>
    /// Shared ReverseMarkdown configuration used for every conversion:
    /// unknown tags are bypassed, comments are removed, and smart href
    /// handling is enabled.
    /// </summary>
    private static readonly Config MARKDOWN_PARSER_CONFIG = new()
    {
        UnknownTags = Config.UnknownTagsOption.Bypass,
        RemoveComments = true,
        SmartHrefHandling = true
    };

    /// <summary>
    /// Time after which a pending page download is cancelled.
    /// </summary>
    private static readonly TimeSpan LOAD_TIMEOUT = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Loads the web content from the specified URL.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as text.</returns>
    public async Task<string> LoadWebContentText(Uri url)
    {
        var doc = await LoadDocument(url);
        return doc.ParsedText;
    }

    /// <summary>
    /// Loads the web content from the specified URL and returns it as an HTML string.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as an HTML string.</returns>
    public async Task<string> LoadWebContentHTML(Uri url)
    {
        var doc = await LoadDocument(url);
        return doc.DocumentNode.InnerHtml;
    }

    /// <summary>
    /// Converts HTML content to the Markdown format.
    /// </summary>
    /// <param name="html">The HTML content to parse.</param>
    /// <returns>The converted Markdown content.</returns>
    public string ParseToMarkdown(string html)
    {
        var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG);
        return markdownConverter.Convert(html);
    }

    /// <summary>
    /// Downloads the page at the given URL, requesting UTF-8 encoding and
    /// passing empty credentials. The download is cancelled once
    /// <see cref="LOAD_TIMEOUT"/> has elapsed.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The loaded HTML document.</returns>
    private static async Task<HtmlDocument> LoadDocument(Uri url)
    {
        // The timeout-based token source owns a timer; the using declaration
        // releases it as soon as the download completes or fails.
        using var cts = new CancellationTokenSource(LOAD_TIMEOUT);
        var parser = new HtmlWeb();
        return await parser.LoadFromWebAsync(url, Encoding.UTF8, new NetworkCredential(), cts.Token);
    }
}