Implemented web content reader

This commit is contained in:
Thorsten Sommer 2024-08-01 21:34:07 +02:00
parent c8d918408f
commit 5fe24fe861
Signed by: tsommer
GPG Key ID: 371BBA77A02C0108
10 changed files with 343 additions and 1 deletions

View File

@ -0,0 +1,31 @@
@* Block for loading the content of a web page by URL; optionally the loaded content is cleaned up by an LLM agent. *@
<MudPaper Class="pa-3 mb-8 border-dashed border rounded-lg">
@* Master switch: the options below are only rendered while it is on. *@
<MudField Label="Read content from web?" Variant="Variant.Outlined" Class="mb-3" Disabled="@this.AgentIsRunning">
<MudSwitch T="bool" @bind-Value="@this.showWebContentReader" Color="Color.Primary" Disabled="@this.AgentIsRunning">
@(this.showWebContentReader ? "Show web content options" : "Hide web content options")
</MudSwitch>
</MudField>
@if (this.showWebContentReader)
{
@* Opt-in for the cleanup agent; ValidateProvider reports a missing provider. *@
<MudField Label="Cleanup content by using a LLM agent?" Variant="Variant.Outlined" Class="mb-3" Disabled="@this.AgentIsRunning">
<MudSwitch T="bool" @bind-Value="@this.useContentCleanerAgent" Color="Color.Primary" Validation="@this.ValidateProvider" Disabled="@this.AgentIsRunning">
@(this.useContentCleanerAgent ? "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used." : "No content cleaning")
</MudSwitch>
</MudField>
@* URL input (validated by ValidateURL) next to the Fetch button; the button stays disabled until IsReady is true and while an agent is running. *@
<MudStack Row="@true" AlignItems="@AlignItems.Baseline" Class="mb-3">
<MudTextField T="string" Label="URL from which to load the content" @bind-Value="@this.providedURL" Validation="@this.ValidateURL" Adornment="Adornment.Start" AdornmentIcon="@Icons.Material.Filled.Link" Placeholder="https://..." HelperText="Loads the content from your URL. Does not work when the content is hidden behind a paywall." Variant="Variant.Outlined" Immediate="@true" Disabled="@this.AgentIsRunning"/>
<MudButton Disabled="@(!this.IsReady || this.AgentIsRunning)" Variant="Variant.Filled" Size="Size.Large" Color="Color.Primary" StartIcon="@Icons.Material.Filled.Download" OnClick="() => this.LoadFromWeb()">
Fetch
</MudButton>
</MudStack>
@* Progress indication: rendered only while the agent is running; the disabled slider shows the current process step. *@
@if (this.AgentIsRunning)
{
<div class="pa-1">
<MudProgressLinear Color="Color.Primary" Indeterminate="true" Class=""/>
</div>
<div class="pa-6">
<MudSlider T="int" Disabled="@true" Value="@this.processStep" Min="@this.process.Min" Max="@this.process.Max" TickMarks="@true" Size="Size.Large" Variant="Variant.Filled" TickMarkLabels="@this.process.Labels" Class="mb-12"/>
</div>
}
}
</MudPaper>

View File

@ -0,0 +1,198 @@
using AIStudio.Agents;
using AIStudio.Chat;
using AIStudio.Settings;
using AIStudio.Tools;
using Microsoft.AspNetCore.Components;
namespace AIStudio.Components.Blocks;
/// <summary>
/// Code-behind of the block that loads a web page from a URL, converts its HTML to
/// Markdown and, when requested, lets the text content cleaner agent tidy up the result.
/// The outcome is published through <see cref="Content"/> / <see cref="ContentChanged"/>.
/// </summary>
public partial class ReadWebContent : ComponentBase
{
    [Inject]
    private HTMLParser HTMLParser { get; init; } = null!;

    [Inject]
    private AgentTextContentCleaner AgentTextContentCleaner { get; init; } = null!;

    [Inject]
    protected SettingsManager SettingsManager { get; set; } = null!;

    [Inject]
    protected IJSRuntime JsRuntime { get; init; } = null!;

    /// <summary>
    /// The Markdown produced by the most recent fetch. Overwritten at the end of every fetch attempt.
    /// </summary>
    [Parameter]
    public string Content { get; set; } = string.Empty;

    /// <summary>
    /// Raised with the new <see cref="Content"/> after every fetch attempt.
    /// </summary>
    [Parameter]
    public EventCallback<string> ContentChanged { get; set; }

    /// <summary>
    /// The provider handed to the cleaner agent, unless the settings preselect a provider for the agent.
    /// </summary>
    [Parameter]
    public Settings.Provider ProviderSettings { get; set; }

    /// <summary>
    /// True while the cleaner agent is processing; the inputs are disabled during that time.
    /// </summary>
    [Parameter]
    public bool AgentIsRunning { get; set; }

    /// <summary>
    /// Raised whenever this component changes <see cref="AgentIsRunning"/>.
    /// </summary>
    [Parameter]
    public EventCallback<bool> AgentIsRunningChanged { get; set; }

    /// <summary>
    /// When true, the web content options are shown right after initialization.
    /// </summary>
    [Parameter]
    public bool Preselect { get; set; }

    /// <summary>
    /// When true, the cleaner agent switch is turned on right after initialization.
    /// </summary>
    [Parameter]
    public bool PreselectContentCleanerAgent { get; set; }

    private Process<ReadWebContentSteps> process = Process<ReadWebContentSteps>.INSTANCE;
    private ProcessStepValue processStep;

    private bool showWebContentReader;
    private bool useContentCleanerAgent;
    private string providedURL = string.Empty;
    private bool urlIsValid;
    private Settings.Provider providerSettings;

    #region Overrides of ComponentBase

    protected override async Task OnInitializedAsync()
    {
        if(this.Preselect)
            this.showWebContentReader = true;

        if(this.PreselectContentCleanerAgent)
            this.useContentCleanerAgent = true;

        // A provider preselected in the settings takes precedence over the one passed in by the parent.
        if (this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions)
            this.providerSettings = this.SettingsManager.ConfigurationData.Providers.FirstOrDefault(x => x.Id == this.SettingsManager.ConfigurationData.PreselectedAgentTextContentCleanerProvider);
        else
            this.providerSettings = this.ProviderSettings;

        await base.OnInitializedAsync();
    }

    protected override async Task OnParametersSetAsync()
    {
        // Follow the parent's provider only when the settings do not preselect one.
        if (!this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions)
            this.providerSettings = this.ProviderSettings;

        await base.OnParametersSetAsync();
    }

    #endregion

    /// <summary>
    /// Loads the page behind the provided URL, converts it to Markdown, optionally runs the
    /// cleaner agent on it, and publishes the result via <see cref="ContentChanged"/>.
    /// </summary>
    /// <remarks>
    /// Failures are swallowed on purpose (best effort): whatever Markdown was produced up to
    /// the failure is published. That is an empty string when loading or parsing failed, and
    /// the uncleaned Markdown when only the agent failed.
    /// </remarks>
    private async Task LoadFromWeb()
    {
        if(!this.IsReady)
            return;

        var markdown = string.Empty;
        try
        {
            this.processStep = this.process[ReadWebContentSteps.LOADING];
            this.StateHasChanged();
            var html = await this.HTMLParser.LoadWebContentHTML(new Uri(this.providedURL));

            this.processStep = this.process[ReadWebContentSteps.PARSING];
            this.StateHasChanged();
            markdown = this.HTMLParser.ParseToMarkdown(html);

            if (this.useContentCleanerAgent)
            {
                this.AgentTextContentCleaner.ProviderSettings = this.providerSettings;
                var additionalData = new Dictionary<string, string>
                {
                    { "sourceURL", this.providedURL },
                };

                this.processStep = this.process[ReadWebContentSteps.CLEANING];
                await this.SetAgentIsRunning(true);

                var contentBlock = await this.AgentTextContentCleaner.ProcessInput(new ContentBlock
                {
                    Time = DateTimeOffset.UtcNow,
                    ContentType = ContentType.TEXT,
                    Role = ChatRole.USER,
                    Content = new ContentText
                    {
                        Text = markdown,
                    },
                }, additionalData);

                // Keep the uncleaned Markdown when the agent did not answer with text.
                markdown = contentBlock.Content is ContentText text ? text.Text : markdown;

                this.processStep = this.process[ReadWebContentSteps.DONE];
                await this.SetAgentIsRunning(false);
            }
        }
        catch
        {
            // Always rewind the progress indicator, not only when the agent had already
            // started; otherwise a failed download leaves the step at LOADING/PARSING.
            this.processStep = this.process[ReadWebContentSteps.START];
            if (this.AgentIsRunning)
                await this.SetAgentIsRunning(false);
        }

        this.Content = markdown;
        await this.ContentChanged.InvokeAsync(this.Content);
    }

    /// <summary>
    /// Updates <see cref="AgentIsRunning"/>, notifies the parent and re-renders this component.
    /// </summary>
    /// <param name="isRunning">The new running state.</param>
    private async Task SetAgentIsRunning(bool isRunning)
    {
        this.AgentIsRunning = isRunning;
        await this.AgentIsRunningChanged.InvokeAsync(this.AgentIsRunning);
        this.StateHasChanged();
    }

    /// <summary>
    /// True when a provider other than the default value is available for the cleaner agent.
    /// </summary>
    private bool HasProvider => !(this.providerSettings == default);

    /// <summary>
    /// True when a fetch may start: the URL passed validation and, if the cleaner agent is
    /// requested, a provider is available.
    /// </summary>
    private bool IsReady
    {
        get
        {
            if(!this.urlIsValid)
                return false;

            // Evaluated against the current provider rather than a flag cached by the switch
            // validation: that flag was never set when the agent was preselected and went
            // stale whenever the provider changed afterwards.
            if(this.useContentCleanerAgent && !this.HasProvider)
                return false;

            return true;
        }
    }

    /// <summary>
    /// Validation callback of the cleaner agent switch.
    /// </summary>
    /// <param name="shouldUseAgent">The new state of the switch.</param>
    /// <returns>An error message when the agent is requested without a provider; otherwise null.</returns>
    private string? ValidateProvider(bool shouldUseAgent)
    {
        if(shouldUseAgent && !this.HasProvider)
            return "Please select a provider to use the cleanup agent.";

        return null;
    }

    /// <summary>
    /// Validation callback of the URL field; also records the outcome for <see cref="IsReady"/>.
    /// </summary>
    /// <param name="url">The text currently entered into the URL field.</param>
    /// <returns>An error message when the text is not an absolute HTTP(S) URL; otherwise null.</returns>
    private string? ValidateURL(string url)
    {
        if(string.IsNullOrWhiteSpace(url))
        {
            this.urlIsValid = false;
            return "Please provide a URL to load the content from.";
        }

        var urlParsingResult = Uri.TryCreate(url, UriKind.Absolute, out var uriResult);
        if(!urlParsingResult)
        {
            this.urlIsValid = false;
            return "Please provide a valid URL.";
        }

        if(uriResult is not { Scheme: "http" or "https" })
        {
            this.urlIsValid = false;
            return "Please provide a valid HTTP or HTTPS URL.";
        }

        this.urlIsValid = true;
        return null;
    }
}

View File

@ -0,0 +1,10 @@
namespace AIStudio.Components.Blocks;
/// <summary>
/// The steps the web content reader walks through, in order of execution.
/// </summary>
public enum ReadWebContentSteps
{
    /// <summary>Nothing is in progress; also the step the reader falls back to after a failure.</summary>
    START = 0,

    /// <summary>The HTML of the page is being downloaded.</summary>
    LOADING = 1,

    /// <summary>The downloaded HTML is being converted to Markdown.</summary>
    PARSING = 2,

    /// <summary>The cleaner agent is processing the Markdown.</summary>
    CLEANING = 3,

    /// <summary>The cleaner agent has finished.</summary>
    DONE = 4,
}

View File

@ -47,6 +47,8 @@
<ThirdPartyComponent Name="arboard" Developer="Artur Kovacs, Avi Weinstock, 1Password & Open Source Community" LicenseName="MIT & Apache-2.0" LicenseUrl="https://github.com/1Password/arboard" RepositoryUrl="https://github.com/1Password/arboard" UseCase="To be able to use the responses of the LLM in other apps, we often use the clipboard of the respective operating system. Unfortunately, in .NET there is no solution that works with all operating systems. Therefore, I have opted for this library in Rust. This way, data transfer to other apps works on every system."/>
<ThirdPartyComponent Name="tokio" Developer="Alex Crichton & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/tokio-rs/tokio/blob/master/LICENSE" RepositoryUrl="https://github.com/tokio-rs/tokio" UseCase="Code in the Rust language can be specified as synchronous or asynchronous. Unlike .NET and the C# language, Rust cannot execute asynchronous code by itself. Rust requires support in the form of an executor for this. Tokio is one such executor."/>
<ThirdPartyComponent Name="flexi_logger" Developer="emabee & Open Source Community" LicenseName="MIT & Apache-2.0" LicenseUrl="https://github.com/emabee/flexi_logger" RepositoryUrl="https://github.com/emabee/flexi_logger" UseCase="This Rust library is used to output the app's messages to the terminal. This is helpful during development and troubleshooting. This feature is initially invisible; when the app is started via the terminal, the messages become visible."/>
<ThirdPartyComponent Name="HtmlAgilityPack" Developer="ZZZ Projects & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/zzzprojects/html-agility-pack/blob/master/LICENSE" RepositoryUrl="https://github.com/zzzprojects/html-agility-pack" UseCase="We use the HtmlAgilityPack to extract content from the web. This is necessary, e.g., when you provide a URL as input for an assistant."/>
<ThirdPartyComponent Name="ReverseMarkdown" Developer="Babu Annamalai & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/mysticmind/reversemarkdown-net/blob/master/LICENSE" RepositoryUrl="https://github.com/mysticmind/reversemarkdown-net" UseCase="This library is used to convert HTML to Markdown. This is necessary, e.g., when you provide a URL as input for an assistant."/>
</MudGrid>
</ExpansionPanel>
<ExpansionPanel HeaderIcon="@Icons.Material.Filled.Verified" HeaderText="License: FSL-1.1-MIT">

View File

@ -131,5 +131,17 @@
}
<ConfigurationProviderSelection Data="@this.availableProviders" Disabled="@(() => !this.SettingsManager.ConfigurationData.PreselectTextSummarizerOptions)" SelectedValue="@(() => this.SettingsManager.ConfigurationData.PreselectedTextSummarizerProvider)" SelectionUpdate="@(selectedValue => this.SettingsManager.ConfigurationData.PreselectedTextSummarizerProvider = selectedValue)"/>
</MudPaper>
<MudText Typo="Typo.h4" Class="mb-3">LLM Agent Options</MudText>
<MudText Typo="Typo.h5" Class="mb-3">Text Content Cleaner Agent</MudText>
<MudPaper Class="pa-3 mb-8 border-dashed border rounded-lg">
<MudText Typo="Typo.body1" Class="mb-3">
Use Case: this agent is used to clean up text content. It extracts the main content, removes advertisements and other irrelevant things,
and attempts to convert relative links into absolute links so that they can be used.
</MudText>
@* Toggles whether the provider selection below is used as the preselected provider for the text content cleaner agent. *@
<ConfigurationOption OptionDescription="Preselect text content cleaner options?" LabelOn="Options are preselected" LabelOff="No options are preselected" State="@(() => this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions)" StateUpdate="@(updatedState => this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions = updatedState)" OptionHelp="When enabled, you can preselect some agent options. This might be useful when you prefer an LLM."/>
<ConfigurationProviderSelection Data="@this.availableProviders" Disabled="@(() => !this.SettingsManager.ConfigurationData.PreselectAgentTextContentCleanerOptions)" SelectedValue="@(() => this.SettingsManager.ConfigurationData.PreselectedAgentTextContentCleanerProvider)" SelectionUpdate="@(selectedValue => this.SettingsManager.ConfigurationData.PreselectedAgentTextContentCleanerProvider = selectedValue)"/>
</MudPaper>
</MudPaper>
</InnerScrolling>

View File

@ -45,9 +45,11 @@
</ItemGroup>
<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.11.62" />
<PackageReference Include="Microsoft.Extensions.FileProviders.Embedded" Version="8.0.7" />
<PackageReference Include="MudBlazor" Version="7.4.0" />
<PackageReference Include="MudBlazor.Markdown" Version="1.0.2" />
<PackageReference Include="ReverseMarkdown" Version="4.6.0" />
</ItemGroup>
<!-- Read the meta data file -->

View File

@ -31,6 +31,7 @@ builder.Services.AddSingleton<Rust>();
builder.Services.AddMudMarkdownClipboardService<MarkdownClipboardService>();
builder.Services.AddSingleton<SettingsManager>();
builder.Services.AddSingleton<ThreadSafeRandom>();
builder.Services.AddTransient<HTMLParser>();
builder.Services.AddTransient<AgentTextContentCleaner>();
builder.Services.AddHostedService<UpdateService>();
builder.Services.AddHostedService<TemporaryChatService>();

View File

@ -199,4 +199,18 @@ public sealed class Data
public string PreselectedTextSummarizerProvider { get; set; } = string.Empty;
#endregion
#region Agent: Text Content Cleaner Settings
/// <summary>
/// Preselect any text content cleaner options?
/// </summary>
public bool PreselectAgentTextContentCleanerOptions { get; set; }
/// <summary>
/// Preselect a text content cleaner provider?
/// </summary>
public string PreselectedAgentTextContentCleanerProvider { get; set; } = string.Empty;
#endregion
}

View File

@ -0,0 +1,57 @@
using System.Net;
using System.Text;
using HtmlAgilityPack;
using ReverseMarkdown;
namespace AIStudio.Tools;
/// <summary>
/// Downloads web pages and converts HTML into Markdown.
/// </summary>
public sealed class HTMLParser
{
    /// <summary>
    /// Upper bound for a single page download; the request is cancelled afterwards.
    /// </summary>
    private static readonly TimeSpan LOAD_TIMEOUT = TimeSpan.FromSeconds(30);

    private static readonly Config MARKDOWN_PARSER_CONFIG = new()
    {
        UnknownTags = Config.UnknownTagsOption.Bypass,
        RemoveComments = true,
        SmartHrefHandling = true
    };

    /// <summary>
    /// Loads the web content from the specified URL.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as text.</returns>
    public async Task<string> LoadWebContentText(Uri url)
    {
        var doc = await LoadDocument(url);
        return doc.ParsedText;
    }

    /// <summary>
    /// Loads the web content from the specified URL and returns it as an HTML string.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The web content as an HTML string.</returns>
    public async Task<string> LoadWebContentHTML(Uri url)
    {
        var doc = await LoadDocument(url);
        return doc.DocumentNode.InnerHtml;
    }

    /// <summary>
    /// Converts HTML content to the Markdown format.
    /// </summary>
    /// <param name="html">The HTML content to parse.</param>
    /// <returns>The converted Markdown content.</returns>
    public string ParseToMarkdown(string html)
    {
        var markdownConverter = new Converter(MARKDOWN_PARSER_CONFIG);
        return markdownConverter.Convert(html);
    }

    /// <summary>
    /// Downloads the page behind <paramref name="url"/>, read as UTF-8, with a timeout of <see cref="LOAD_TIMEOUT"/>.
    /// </summary>
    /// <param name="url">The URL of the web page.</param>
    /// <returns>The loaded document.</returns>
    private static async Task<HtmlDocument> LoadDocument(Uri url)
    {
        // The token source is created with a timeout; the using declaration disposes it
        // once the download has completed or failed.
        using var cts = new CancellationTokenSource(LOAD_TIMEOUT);
        var parser = new HtmlWeb();
        return await parser.LoadFromWebAsync(url, Encoding.UTF8, new NetworkCredential(), cts.Token);
    }
}

View File

@ -2,6 +2,12 @@
"version": 1,
"dependencies": {
"net8.0": {
"HtmlAgilityPack": {
"type": "Direct",
"requested": "[1.11.62, )",
"resolved": "1.11.62",
"contentHash": "KS4h7ZjWsO6YixRfQgYdR+PZMbTaZod1LBPi+1Ph7dJCARzQm7DOKe5HPhP/o0EWX5l7QCgAZHa4VOa4pQa8JQ=="
},
"Microsoft.Extensions.FileProviders.Embedded": {
"type": "Direct",
"requested": "[8.0.7, )",
@ -38,6 +44,15 @@
"MudBlazor": "6.20.0"
}
},
"ReverseMarkdown": {
"type": "Direct",
"requested": "[4.6.0, )",
"resolved": "4.6.0",
"contentHash": "ehNpMz3yQwd7P/vHpwi4KyDlT8UtVmtiL+NTb6mFEPzbLqJXbRIGF4OxEA5tuBA5Cfwhzf537TX1UIB6dUpo7A==",
"dependencies": {
"HtmlAgilityPack": "1.11.61"
}
},
"Markdig": {
"type": "Transitive",
"resolved": "0.37.0",
@ -163,6 +178,6 @@
"contentHash": "FHNOatmUq0sqJOkTx+UF/9YK1f180cnW5FVqnQMvYUN0elp6wFzbtPSiqbo1/ru8ICp43JM1i7kKkk6GsNGHlA=="
}
},
"net8.0/osx-x64": {}
"net8.0/osx-arm64": {}
}
}