diff --git a/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor
new file mode 100644
index 00000000..8b33a12e
--- /dev/null
+++ b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor
@@ -0,0 +1,31 @@
+
+
+
+ @(this.showWebContentReader ? "Show web content options" : "Hide web content options")
+
+
+
+ @if (this.showWebContentReader)
+ {
+
+
+ @(this.useContentCleanerAgent ? "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used." : "No content cleaning")
+
+
+
+
+
+ Fetch
+
+
+ @if (this.AgentIsRunning)
+ {
+
+
+
+
+
+
+ }
+ }
+
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor.cs b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor.cs
new file mode 100644
index 00000000..56cbd60e
--- /dev/null
+++ b/app/MindWork AI Studio/Components/Blocks/ReadWebContent.razor.cs
@@ -0,0 +1,198 @@
+using AIStudio.Agents;
+using AIStudio.Chat;
+using AIStudio.Settings;
+using AIStudio.Tools;
+
+using Microsoft.AspNetCore.Components;
+
+namespace AIStudio.Components.Blocks;
+
+/// <summary>Code-behind of the block that fetches a web page, converts it to Markdown and optionally runs the content cleaner agent on it.</summary>
+public partial class ReadWebContent : ComponentBase
+{
+    [Inject]
+    private HTMLParser HTMLParser { get; init; } = null!;
+
+    [Inject]
+    private AgentTextContentCleaner AgentTextContentCleaner { get; init; } = null!;
+
+    [Inject]
+    protected SettingsManager SettingsManager { get; set; } = null!;
+
+    [Inject]
+    protected IJSRuntime JsRuntime { get; init; } = null!;
+
+    /// <summary>Receives the fetched Markdown; paired with <see cref="ContentChanged"/>.</summary>
+    [Parameter]
+    public string Content { get; set; } = string.Empty;
+
+    [Parameter]
+    public EventCallback<string> ContentChanged { get; set; }
+
+    /// <summary>Provider used for the cleaner agent unless the settings preselect one.</summary>
+    [Parameter]
+    public Settings.Provider ProviderSettings { get; set; }
+
+    /// <summary>True while the cleaner agent is working; paired with <see cref="AgentIsRunningChanged"/>.</summary>
+    [Parameter]
+    public bool AgentIsRunning { get; set; }
+
+    [Parameter]
+    public EventCallback<bool> AgentIsRunningChanged { get; set; }
+
+    [Parameter]
+    public bool Preselect { get; set; }
+
+    [Parameter]
+    public bool PreselectContentCleanerAgent { get; set; }
+
+    private Process process = Process.INSTANCE;
+    private ProcessStepValue processStep;
+    private bool showWebContentReader;
+    private bool useContentCleanerAgent;
+    private string providedURL = string.Empty;
+    private bool urlIsValid;
+    private bool isProviderValid;
+
+    // Provider handed to the agent: the one preselected in the settings, otherwise ProviderSettings.
+    private Settings.Provider providerSettings;
+
+    #region Overrides of ComponentBase
+
+    /// <summary>Applies the preselection parameters and resolves the provider for the agent.</summary>
+    protected override async Task OnInitializedAsync()
+    {
+        if(this.Preselect)
+            this.showWebContentReader = true;
+        if(this.PreselectContentCleanerAgent)
+            this.useContentCleanerAgent = true;
+
+        this.providerSettings = this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions
+            ? this.SettingsManager.ConfigurationData.Providers.FirstOrDefault(x => x.Id == this.SettingsManager.ConfigurationData.PreselectedAgentTextContentCleanerProvider)
+            : this.ProviderSettings;
+
+        await base.OnInitializedAsync();
+    }
+
+    /// <summary>Follows <see cref="ProviderSettings"/> unless the settings preselect a provider.</summary>
+    protected override async Task OnParametersSetAsync()
+    {
+        if (!this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions)
+            this.providerSettings = this.ProviderSettings;
+
+        await base.OnParametersSetAsync();
+    }
+
+    #endregion
+
+    /// <summary>Fetches the provided URL, converts the page to Markdown, optionally cleans it and publishes the result via <see cref="ContentChanged"/>.</summary>
+    private async Task LoadFromWeb()
+    {
+        if(!this.IsReady)
+            return;
+
+        string? markdown = null;
+        try
+        {
+            this.processStep = this.process[ReadWebContentSteps.LOADING];
+            this.StateHasChanged();
+
+            var html = await this.HTMLParser.LoadWebContentHTML(new Uri(this.providedURL));
+
+            this.processStep = this.process[ReadWebContentSteps.PARSING];
+            this.StateHasChanged();
+            markdown = this.HTMLParser.ParseToMarkdown(html);
+
+            if (this.useContentCleanerAgent)
+            {
+                this.AgentTextContentCleaner.ProviderSettings = this.providerSettings;
+                var additionalData = new Dictionary<string, string>
+                {
+                    { "sourceURL", this.providedURL },
+                };
+
+                this.processStep = this.process[ReadWebContentSteps.CLEANING];
+                this.AgentIsRunning = true;
+                await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
+                this.StateHasChanged();
+
+                var contentBlock = await this.AgentTextContentCleaner.ProcessInput(new ContentBlock
+                {
+                    Time = DateTimeOffset.UtcNow,
+                    ContentType = ContentType.TEXT,
+                    Role = ChatRole.USER,
+                    Content = new ContentText
+                    {
+                        Text = markdown,
+                    },
+                }, additionalData);
+
+                markdown = contentBlock.Content is ContentText text ? text.Text : markdown;
+
+                this.processStep = this.process[ReadWebContentSteps.DONE];
+                this.AgentIsRunning = false;
+                await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
+                this.StateHasChanged();
+            }
+        }
+        catch
+        {
+            // Best effort: swallow the failure, but clear the "agent is running" state here and in the parent.
+            if (this.AgentIsRunning)
+            {
+                this.processStep = this.process[ReadWebContentSteps.START];
+                this.AgentIsRunning = false;
+                await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
+                this.StateHasChanged();
+            }
+        }
+
+        // A failed download/conversion leaves markdown null: keep the bound content instead of replacing it with an empty string.
+        if(markdown is null)
+            return;
+
+        this.Content = markdown;
+        await this.ContentChanged.InvokeAsync(this.Content);
+    }
+
+    /// <summary>True when the URL validated and, if the cleaner agent is enabled, the provider validated as well.</summary>
+    private bool IsReady => this.urlIsValid && (!this.useContentCleanerAgent || this.isProviderValid);
+
+    /// <summary>Checks that a provider is set when the agent should be used; records the outcome and returns the error message or null.</summary>
+    private string? ValidateProvider(bool shouldUseAgent)
+    {
+        if(shouldUseAgent && this.providerSettings == default)
+        {
+            this.isProviderValid = false;
+            return "Please select a provider to use the cleanup agent.";
+        }
+
+        this.isProviderValid = true;
+        return null;
+    }
+
+    /// <summary>Checks that <paramref name="url"/> is an absolute HTTP or HTTPS URL; records the outcome and returns the error message or null.</summary>
+    private string? ValidateURL(string url)
+    {
+        if(string.IsNullOrWhiteSpace(url))
+        {
+            this.urlIsValid = false;
+            return "Please provide a URL to load the content from.";
+        }
+
+        if(!Uri.TryCreate(url, UriKind.Absolute, out var uriResult))
+        {
+            this.urlIsValid = false;
+            return "Please provide a valid URL.";
+        }
+
+        if(uriResult is not { Scheme: "http" or "https" })
+        {
+            this.urlIsValid = false;
+            return "Please provide a valid HTTP or HTTPS URL.";
+        }
+
+        this.urlIsValid = true;
+        return null;
+    }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Components/Blocks/ReadWebContentSteps.cs b/app/MindWork AI Studio/Components/Blocks/ReadWebContentSteps.cs
new file mode 100644
index 00000000..3dafa015
--- /dev/null
+++ b/app/MindWork AI Studio/Components/Blocks/ReadWebContentSteps.cs
@@ -0,0 +1,10 @@
+namespace AIStudio.Components.Blocks;
+
+public enum ReadWebContentSteps
+{
+    START = 0,    // Initial step; LoadFromWeb falls back to it when the cleaner agent fails.
+    LOADING = 1,  // Set right before the HTML of the page is downloaded.
+    PARSING = 2,  // Set right before the HTML is converted to Markdown.
+    CLEANING = 3, // Set right before the content cleaner agent processes the Markdown.
+    DONE = 4,     // Set after the content cleaner agent returned.
+}
diff --git a/app/MindWork AI Studio/Components/Pages/About.razor b/app/MindWork AI Studio/Components/Pages/About.razor
index 6aa02bab..0f29700f 100644
--- a/app/MindWork AI Studio/Components/Pages/About.razor
+++ b/app/MindWork AI Studio/Components/Pages/About.razor
@@ -47,6 +47,8 @@
+
+
diff --git a/app/MindWork AI Studio/Components/Pages/Settings.razor b/app/MindWork AI Studio/Components/Pages/Settings.razor
index 6551a722..138ab751 100644
--- a/app/MindWork AI Studio/Components/Pages/Settings.razor
+++ b/app/MindWork AI Studio/Components/Pages/Settings.razor
@@ -131,5 +131,17 @@
}
+
+ LLM Agent Options
+
+ Text Content Cleaner Agent
+
+
+ Use Case: this agent is used to clean up text content. It extracts the main content, removes advertisements and other irrelevant things,
+ and attempts to convert relative links into absolute links so that they can be used.
+
+
+
+
\ No newline at end of file
diff --git a/app/MindWork AI Studio/MindWork AI Studio.csproj b/app/MindWork AI Studio/MindWork AI Studio.csproj
index 9d6605a2..a24bc833 100644
--- a/app/MindWork AI Studio/MindWork AI Studio.csproj
+++ b/app/MindWork AI Studio/MindWork AI Studio.csproj
@@ -45,9 +45,11 @@
+
+
diff --git a/app/MindWork AI Studio/Program.cs b/app/MindWork AI Studio/Program.cs
index 6bd7e0a7..5bc114a2 100644
--- a/app/MindWork AI Studio/Program.cs
+++ b/app/MindWork AI Studio/Program.cs
@@ -31,6 +31,7 @@ builder.Services.AddSingleton();
builder.Services.AddMudMarkdownClipboardService();
builder.Services.AddSingleton();
builder.Services.AddSingleton();
+builder.Services.AddTransient();
builder.Services.AddTransient();
builder.Services.AddHostedService();
builder.Services.AddHostedService();
diff --git a/app/MindWork AI Studio/Settings/DataModel/Data.cs b/app/MindWork AI Studio/Settings/DataModel/Data.cs
index 26639b6a..12ff3533 100644
--- a/app/MindWork AI Studio/Settings/DataModel/Data.cs
+++ b/app/MindWork AI Studio/Settings/DataModel/Data.cs
@@ -199,4 +199,18 @@ public sealed class Data
public string PreselectedTextSummarizerProvider { get; set; } = string.Empty;
#endregion
+
+ #region Agent: Text Content Cleaner Settings
+
+ /// <summary>
+ /// Preselect any text content cleaner options? When true, <c>ReadWebContent</c> takes the agent's provider from <see cref="PreselectedAgentTextContentCleanerProvider"/>.
+ /// </summary>
+ public bool PreselectAgentTextContentCleanerOptions { get; set; }
+
+ /// <summary>
+ /// Preselect a text content cleaner provider? Holds the Id that <c>ReadWebContent</c> looks up in the configured providers.
+ /// </summary>
+ public string PreselectedAgentTextContentCleanerProvider { get; set; } = string.Empty;
+
+ #endregion
}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/HTMLParser.cs b/app/MindWork AI Studio/Tools/HTMLParser.cs
new file mode 100644
index 00000000..4f9dca2a
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/HTMLParser.cs
@@ -0,0 +1,57 @@
+using System.Net;
+using System.Text;
+
+using HtmlAgilityPack;
+
+using ReverseMarkdown;
+
+namespace AIStudio.Tools;
+
+/// <summary>Loads web pages with HtmlAgilityPack and converts HTML to Markdown with ReverseMarkdown.</summary>
+public sealed class HTMLParser
+{
+    // Converter settings shared by all ParseToMarkdown calls.
+    private static readonly Config MARKDOWN_PARSER_CONFIG = new()
+    {
+        UnknownTags = Config.UnknownTagsOption.Bypass,
+        RemoveComments = true,
+        SmartHrefHandling = true
+    };
+
+    /// <summary>
+    /// Loads the web content from the specified URL. The cancellation token passed to the request fires after 30 seconds.
+    /// </summary>
+    /// <param name="url">The URL of the web page.</param>
+    /// <returns>The web content as text.</returns>
+    public async Task<string> LoadWebContentText(Uri url)
+    {
+        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); // disposed so the timeout timer is released
+        var parser = new HtmlWeb();
+        var doc = await parser.LoadFromWebAsync(url, Encoding.UTF8, new NetworkCredential(), cts.Token);
+        return doc.ParsedText;
+    }
+
+    /// <summary>
+    /// Loads the web content from the specified URL and returns it as an HTML string. The cancellation token passed to the request fires after 30 seconds.
+    /// </summary>
+    /// <param name="url">The URL of the web page.</param>
+    /// <returns>The web content as an HTML string.</returns>
+    public async Task<string> LoadWebContentHTML(Uri url)
+    {
+        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); // disposed so the timeout timer is released
+        var parser = new HtmlWeb();
+        var doc = await parser.LoadFromWebAsync(url, Encoding.UTF8, new NetworkCredential(), cts.Token);
+        return doc.DocumentNode.InnerHtml;
+    }
+
+    /// <summary>
+    /// Converts HTML content to the Markdown format.
+    /// </summary>
+    /// <param name="html">The HTML content to parse.</param>
+    /// <returns>The converted Markdown content.</returns>
+    public string ParseToMarkdown(string html)
+    {
+        var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG);
+        return markdownConverter.Convert(html);
+    }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/packages.lock.json b/app/MindWork AI Studio/packages.lock.json
index 5dd25e9e..220be608 100644
--- a/app/MindWork AI Studio/packages.lock.json
+++ b/app/MindWork AI Studio/packages.lock.json
@@ -2,6 +2,12 @@
"version": 1,
"dependencies": {
"net8.0": {
+ "HtmlAgilityPack": {
+ "type": "Direct",
+ "requested": "[1.11.62, )",
+ "resolved": "1.11.62",
+ "contentHash": "KS4h7ZjWsO6YixRfQgYdR+PZMbTaZod1LBPi+1Ph7dJCARzQm7DOKe5HPhP/o0EWX5l7QCgAZHa4VOa4pQa8JQ=="
+ },
"Microsoft.Extensions.FileProviders.Embedded": {
"type": "Direct",
"requested": "[8.0.7, )",
@@ -38,6 +44,15 @@
"MudBlazor": "6.20.0"
}
},
+ "ReverseMarkdown": {
+ "type": "Direct",
+ "requested": "[4.6.0, )",
+ "resolved": "4.6.0",
+ "contentHash": "ehNpMz3yQwd7P/vHpwi4KyDlT8UtVmtiL+NTb6mFEPzbLqJXbRIGF4OxEA5tuBA5Cfwhzf537TX1UIB6dUpo7A==",
+ "dependencies": {
+ "HtmlAgilityPack": "1.11.61"
+ }
+ },
"Markdig": {
"type": "Transitive",
"resolved": "0.37.0",
@@ -163,6 +178,6 @@
"contentHash": "FHNOatmUq0sqJOkTx+UF/9YK1f180cnW5FVqnQMvYUN0elp6wFzbtPSiqbo1/ru8ICp43JM1i7kKkk6GsNGHlA=="
}
},
- "net8.0/osx-x64": {}
+ "net8.0/osx-arm64": {}
}
}
\ No newline at end of file