diff --git a/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor new file mode 100644 index 00000000..8b33a12e --- /dev/null +++ b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor @@ -0,0 +1,31 @@ + + + + @(this.showWebContentReader ? "Show web content options" : "Hide web content options") + + + + @if (this.showWebContentReader) + { + + + @(this.useContentCleanerAgent ? "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used." : "No content cleaning") + + + + + + Fetch + + + @if (this.AgentIsRunning) + { +
+ +
+
+ +
+ } + } +
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor.cs b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor.cs
new file mode 100644
index 00000000..56cbd60e
--- /dev/null
+++ b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor.cs
@@ -0,0 +1,243 @@
+using AIStudio.Agents;
+using AIStudio.Chat;
+using AIStudio.Settings;
+using AIStudio.Tools;
+
+using Microsoft.AspNetCore.Components;
+
+namespace AIStudio.Components.Blocks;
+
+/// <summary>
+/// Loads a web page, converts it to Markdown and, when requested, passes the
+/// Markdown through the text content cleaner agent before publishing it.
+/// </summary>
+public partial class ReadWebContent : ComponentBase
+{
+    [Inject]
+    private HTMLParser HTMLParser { get; init; } = null!;
+
+    [Inject]
+    private AgentTextContentCleaner AgentTextContentCleaner { get; init; } = null!;
+
+    [Inject]
+    protected SettingsManager SettingsManager { get; set; } = null!;
+
+    [Inject]
+    protected IJSRuntime JsRuntime { get; init; } = null!;
+
+    /// <summary>
+    /// The loaded Markdown; updates are reported through <see cref="ContentChanged"/>.
+    /// </summary>
+    [Parameter]
+    public string Content { get; set; } = string.Empty;
+
+    [Parameter]
+    public EventCallback ContentChanged { get; set; }
+
+    /// <summary>
+    /// Provider used for the cleaner agent unless the settings preselect one.
+    /// </summary>
+    [Parameter]
+    public Settings.Provider ProviderSettings { get; set; }
+
+    /// <summary>
+    /// Set to true while the content cleaner agent processes the Markdown.
+    /// </summary>
+    [Parameter]
+    public bool AgentIsRunning { get; set; }
+
+    [Parameter]
+    public EventCallback AgentIsRunningChanged { get; set; }
+
+    /// <summary>
+    /// When true, the web content options start out enabled.
+    /// </summary>
+    [Parameter]
+    public bool Preselect { get; set; }
+
+    /// <summary>
+    /// When true, the content cleaner agent starts out enabled.
+    /// </summary>
+    [Parameter]
+    public bool PreselectContentCleanerAgent { get; set; }
+
+    private Process process = Process.INSTANCE;
+    private ProcessStepValue processStep;
+
+    private bool showWebContentReader;
+    private bool useContentCleanerAgent;
+    private string providedURL = string.Empty;
+    private bool urlIsValid;
+    private bool isProviderValid;
+
+    private Settings.Provider providerSettings;
+
+    #region Overrides of ComponentBase
+
+    /// <summary>
+    /// Applies the preselection parameters and picks the initial provider.
+    /// </summary>
+    protected override async Task OnInitializedAsync()
+    {
+        if(this.Preselect)
+            this.showWebContentReader = true;
+
+        if(this.PreselectContentCleanerAgent)
+            this.useContentCleanerAgent = true;
+
+        var configuration = this.SettingsManager.ConfigurationData;
+        if (configuration.PreselectAgentTextContentCleanerOptions)
+            this.providerSettings = configuration.Providers.FirstOrDefault(x => x.Id == configuration.PreselectedAgentTextContentCleanerProvider);
+        else
+            this.providerSettings = this.ProviderSettings;
+
+        await base.OnInitializedAsync();
+    }
+
+    /// <summary>
+    /// Follows the <see cref="ProviderSettings"/> parameter unless the settings preselect a provider.
+    /// </summary>
+    protected override async Task OnParametersSetAsync()
+    {
+        if (!this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions)
+            this.providerSettings = this.ProviderSettings;
+
+        await base.OnParametersSetAsync();
+    }
+
+    #endregion
+
+    /// <summary>
+    /// Fetches the page at the provided URL, converts it to Markdown, optionally cleans
+    /// it with the agent, and publishes the result through <see cref="ContentChanged"/>.
+    /// </summary>
+    /// <remarks>
+    /// Failures are handled on a best-effort basis: whatever Markdown was produced
+    /// before the failure (possibly none) is still published.
+    /// </remarks>
+    private async Task LoadFromWeb()
+    {
+        if(!this.IsReady)
+            return;
+
+        var markdown = string.Empty;
+        try
+        {
+            this.processStep = this.process[ReadWebContentSteps.LOADING];
+            this.StateHasChanged();
+
+            var html = await this.HTMLParser.LoadWebContentHTML(new Uri(this.providedURL));
+
+            this.processStep = this.process[ReadWebContentSteps.PARSING];
+            this.StateHasChanged();
+            markdown = this.HTMLParser.ParseToMarkdown(html);
+
+            if (this.useContentCleanerAgent)
+            {
+                this.AgentTextContentCleaner.ProviderSettings = this.providerSettings;
+                var additionalData = new Dictionary<string, string>
+                {
+                    { "sourceURL", this.providedURL },
+                };
+
+                this.processStep = this.process[ReadWebContentSteps.CLEANING];
+                this.AgentIsRunning = true;
+                await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
+                this.StateHasChanged();
+
+                var contentBlock = await this.AgentTextContentCleaner.ProcessInput(new ContentBlock
+                {
+                    Time = DateTimeOffset.UtcNow,
+                    ContentType = ContentType.TEXT,
+                    Role = ChatRole.USER,
+                    Content = new ContentText
+                    {
+                        Text = markdown,
+                    },
+                }, additionalData);
+
+                // Keep the uncleaned Markdown when the agent returns no text content.
+                markdown = contentBlock.Content is ContentText text ? text.Text : markdown;
+
+                this.processStep = this.process[ReadWebContentSteps.DONE];
+                this.AgentIsRunning = false;
+                await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
+                this.StateHasChanged();
+            }
+            else
+            {
+                // Without the cleaner, parsing is the last step; mark the process as finished.
+                this.processStep = this.process[ReadWebContentSteps.DONE];
+                this.StateHasChanged();
+            }
+        }
+        catch
+        {
+            // Best effort: reset the step for every failure, not only for
+            // failures that happen while the agent is running.
+            this.processStep = this.process[ReadWebContentSteps.START];
+            if (this.AgentIsRunning)
+            {
+                this.AgentIsRunning = false;
+                await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
+            }
+
+            this.StateHasChanged();
+        }
+
+        this.Content = markdown;
+        await this.ContentChanged.InvokeAsync(this.Content);
+    }
+
+    /// <summary>
+    /// True when the URL passed validation and, if the cleaner agent is enabled, the provider did too.
+    /// </summary>
+    private bool IsReady => this.urlIsValid && (!this.useContentCleanerAgent || this.isProviderValid);
+
+    /// <summary>
+    /// Checks that a provider is available when the cleaner agent should be used.
+    /// </summary>
+    /// <param name="shouldUseAgent">Whether the cleaner agent is requested.</param>
+    /// <returns>An error message, or null when the provider is acceptable.</returns>
+    private string? ValidateProvider(bool shouldUseAgent)
+    {
+        if(shouldUseAgent && this.providerSettings == default)
+        {
+            this.isProviderValid = false;
+            return "Please select a provider to use the cleanup agent.";
+        }
+
+        this.isProviderValid = true;
+        return null;
+    }
+
+    /// <summary>
+    /// Checks that the given text is an absolute HTTP or HTTPS URL.
+    /// </summary>
+    /// <param name="url">The text entered as URL.</param>
+    /// <returns>An error message, or null when the URL is acceptable.</returns>
+    private string? ValidateURL(string url)
+    {
+        if(string.IsNullOrWhiteSpace(url))
+        {
+            this.urlIsValid = false;
+            return "Please provide a URL to load the content from.";
+        }
+
+        var urlParsingResult = Uri.TryCreate(url, UriKind.Absolute, out var uriResult);
+        if(!urlParsingResult)
+        {
+            this.urlIsValid = false;
+            return "Please provide a valid URL.";
+        }
+
+        if(uriResult is not { Scheme: "http" or "https" })
+        {
+            this.urlIsValid = false;
+            return "Please provide a valid HTTP or HTTPS URL.";
+        }
+
+        this.urlIsValid = true;
+        return null;
+    }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Components/Blocks/ReadWebContentSteps.cs b/app/MindWork AI Studio/Components/Blocks/ReadWebContentSteps.cs
new file mode 100644
index 00000000..3dafa015
--- /dev/null
+++ b/app/MindWork AI Studio/Components/Blocks/ReadWebContentSteps.cs
@@ -0,0 +1,13 @@
+namespace AIStudio.Components.Blocks;
+
+/// <summary>
+/// The steps the ReadWebContent component moves through while loading a page.
+/// </summary>
+public enum ReadWebContentSteps
+{
+    START,
+    LOADING,
+    PARSING,
+    CLEANING,
+    DONE,
+}
diff --git a/app/MindWork AI Studio/Components/Pages/About.razor b/app/MindWork AI Studio/Components/Pages/About.razor
index 6aa02bab..0f29700f 100644
--- a/app/MindWork AI Studio/Components/Pages/About.razor
+++ b/app/MindWork AI Studio/Components/Pages/About.razor
@@ -47,6 +47,8 @@
+
+
diff --git a/app/MindWork AI Studio/Components/Pages/Settings.razor b/app/MindWork AI Studio/Components/Pages/Settings.razor
index 6551a722..138ab751 100644
--- a/app/MindWork AI Studio/Components/Pages/Settings.razor
+++ b/app/MindWork AI Studio/Components/Pages/Settings.razor
@@ -131,5 +131,17 @@
 }
+
+ LLM Agent Options
+
+ Text Content Cleaner Agent
+
+
+ Use Case: this agent is used to clean up text content. It extracts the main content, removes advertisements and other irrelevant things,
+ and attempts to convert relative links into absolute links so that they can be used.
+ + + + \ No newline at end of file diff --git a/app/MindWork AI Studio/MindWork AI Studio.csproj b/app/MindWork AI Studio/MindWork AI Studio.csproj index 9d6605a2..a24bc833 100644 --- a/app/MindWork AI Studio/MindWork AI Studio.csproj +++ b/app/MindWork AI Studio/MindWork AI Studio.csproj @@ -45,9 +45,11 @@ + + diff --git a/app/MindWork AI Studio/Program.cs b/app/MindWork AI Studio/Program.cs index 6bd7e0a7..5bc114a2 100644 --- a/app/MindWork AI Studio/Program.cs +++ b/app/MindWork AI Studio/Program.cs @@ -31,6 +31,7 @@ builder.Services.AddSingleton(); builder.Services.AddMudMarkdownClipboardService(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); +builder.Services.AddTransient(); builder.Services.AddTransient(); builder.Services.AddHostedService(); builder.Services.AddHostedService(); diff --git a/app/MindWork AI Studio/Settings/DataModel/Data.cs b/app/MindWork AI Studio/Settings/DataModel/Data.cs index 26639b6a..12ff3533 100644 --- a/app/MindWork AI Studio/Settings/DataModel/Data.cs +++ b/app/MindWork AI Studio/Settings/DataModel/Data.cs @@ -199,4 +199,18 @@ public sealed class Data public string PreselectedTextSummarizerProvider { get; set; } = string.Empty; #endregion + + #region Agent: Text Content Cleaner Settings + + /// + /// Preselect any text content cleaner options? + /// + public bool PreselectAgentTextContentCleanerOptions { get; set; } + + /// + /// Preselect a text content cleaner provider? 
+    ///
+    public string PreselectedAgentTextContentCleanerProvider { get; set; } = string.Empty;
+
+    #endregion
 }
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/HTMLParser.cs b/app/MindWork AI Studio/Tools/HTMLParser.cs
new file mode 100644
index 00000000..4f9dca2a
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/HTMLParser.cs
@@ -0,0 +1,72 @@
+using System.Net;
+using System.Text;
+
+using HtmlAgilityPack;
+
+using ReverseMarkdown;
+
+namespace AIStudio.Tools;
+
+/// <summary>
+/// Downloads web pages and converts HTML into Markdown.
+/// </summary>
+public sealed class HTMLParser
+{
+    private static readonly Config MARKDOWN_PARSER_CONFIG = new()
+    {
+        UnknownTags = Config.UnknownTagsOption.Bypass,
+        RemoveComments = true,
+        SmartHrefHandling = true
+    };
+
+    /// <summary>
+    /// Time after which the cancellation token of a pending download fires.
+    /// </summary>
+    private static readonly TimeSpan REQUEST_TIMEOUT = TimeSpan.FromSeconds(30);
+
+    /// <summary>
+    /// Loads the web content from the specified URL.
+    /// </summary>
+    /// <param name="url">The URL of the web page.</param>
+    /// <returns>The web content as text.</returns>
+    public async Task<string> LoadWebContentText(Uri url)
+    {
+        var doc = await LoadDocument(url);
+        return doc.ParsedText;
+    }
+
+    /// <summary>
+    /// Loads the web content from the specified URL and returns it as an HTML string.
+    /// </summary>
+    /// <param name="url">The URL of the web page.</param>
+    /// <returns>The web content as an HTML string.</returns>
+    public async Task<string> LoadWebContentHTML(Uri url)
+    {
+        var doc = await LoadDocument(url);
+        return doc.DocumentNode.InnerHtml;
+    }
+
+    /// <summary>
+    /// Converts HTML content to the Markdown format.
+    /// </summary>
+    /// <param name="html">The HTML content to parse.</param>
+    /// <returns>The converted Markdown content.</returns>
+    public string ParseToMarkdown(string html)
+    {
+        var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG);
+        return markdownConverter.Convert(html);
+    }
+
+    /// <summary>
+    /// Downloads the document at the given URL, passing a token that is cancelled after <see cref="REQUEST_TIMEOUT"/>.
+    /// </summary>
+    /// <param name="url">The URL of the web page.</param>
+    /// <returns>The loaded HTML document.</returns>
+    private static async Task<HtmlDocument> LoadDocument(Uri url)
+    {
+        // Dispose the token source so its timeout timer is released once the download ends.
+        using var cts = new CancellationTokenSource(REQUEST_TIMEOUT);
+        var parser = new HtmlWeb();
+        return await parser.LoadFromWebAsync(url, Encoding.UTF8, new NetworkCredential(), cts.Token);
+    }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/packages.lock.json b/app/MindWork AI Studio/packages.lock.json
index 5dd25e9e..220be608 100644
--- a/app/MindWork AI Studio/packages.lock.json
+++ b/app/MindWork AI Studio/packages.lock.json
@@ -2,6 +2,12 @@
   "version": 1,
   "dependencies": {
     "net8.0": {
+      "HtmlAgilityPack": {
+        "type": "Direct",
+        "requested": "[1.11.62, )",
+        "resolved": "1.11.62",
+        "contentHash": "KS4h7ZjWsO6YixRfQgYdR+PZMbTaZod1LBPi+1Ph7dJCARzQm7DOKe5HPhP/o0EWX5l7QCgAZHa4VOa4pQa8JQ=="
+      },
       "Microsoft.Extensions.FileProviders.Embedded": {
         "type": "Direct",
         "requested": "[8.0.7, )",
@@ -38,6 +44,15 @@
           "MudBlazor": "6.20.0"
         }
       },
+      "ReverseMarkdown": {
+        "type": "Direct",
+        "requested": "[4.6.0, )",
+        "resolved": "4.6.0",
+        "contentHash": "ehNpMz3yQwd7P/vHpwi4KyDlT8UtVmtiL+NTb6mFEPzbLqJXbRIGF4OxEA5tuBA5Cfwhzf537TX1UIB6dUpo7A==",
+        "dependencies": {
+          "HtmlAgilityPack": "1.11.61"
+        }
+      },
       "Markdig": {
         "type": "Transitive",
         "resolved": "0.37.0",
@@ -163,6 +178,6 @@
         "contentHash": "FHNOatmUq0sqJOkTx+UF/9YK1f180cnW5FVqnQMvYUN0elp6wFzbtPSiqbo1/ru8ICp43JM1i7kKkk6GsNGHlA=="
       }
     },
-    "net8.0/osx-x64": {}
+    "net8.0/osx-arm64": {}
   }
 }
\ No newline at end of file