mirror of
https://github.com/MindWorkAI/AI-Studio.git
synced 2025-07-04 00:22:56 +00:00
Allow loading any document in some assistants (#508)
Co-authored-by: krut_ni <nils.kruthoff@dlr.de> Co-authored-by: Thorsten Sommer <mail@tsommer.org>
This commit is contained in:
parent
7df0b3e6e0
commit
aaedf667fe
@ -1648,11 +1648,17 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROFILESELECTION::T918741365"] = "You can
|
|||||||
-- Provider
|
-- Provider
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROVIDERSELECTION::T900237532"] = "Provider"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROVIDERSELECTION::T900237532"] = "Provider"
|
||||||
|
|
||||||
-- Use PDF content as input
|
-- Images are not supported yet
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READPDFCONTENT::T2849276709"] = "Use PDF content as input"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T298062956"] = "Images are not supported yet"
|
||||||
|
|
||||||
-- Select PDF file
|
-- Use file content as input
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READPDFCONTENT::T63272795"] = "Select PDF file"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T3499386973"] = "Use file content as input"
|
||||||
|
|
||||||
|
-- Select file to read its content
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T354817589"] = "Select file to read its content"
|
||||||
|
|
||||||
|
-- Executables are not allowed
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T4167762413"] = "Executables are not allowed"
|
||||||
|
|
||||||
-- The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used.
|
-- The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used.
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READWEBCONTENT::T1164201762"] = "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used."
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READWEBCONTENT::T1164201762"] = "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used."
|
||||||
@ -5482,6 +5488,9 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T304
|
|||||||
-- AI source selection with AI retrieval context validation
|
-- AI source selection with AI retrieval context validation
|
||||||
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "AI source selection with AI retrieval context validation"
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "AI source selection with AI retrieval context validation"
|
||||||
|
|
||||||
|
-- Executable Files
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2217313358"] = "Executable Files"
|
||||||
|
|
||||||
-- PDF Files
|
-- PDF Files
|
||||||
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF Files"
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF Files"
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
<ReadWebContent @bind-Content="@this.inputLegalDocument" ProviderSettings="@this.providerSettings" @bind-AgentIsRunning="@this.isAgentRunning" Preselect="@(this.SettingsManager.ConfigurationData.LegalCheck.PreselectOptions && this.SettingsManager.ConfigurationData.LegalCheck.PreselectWebContentReader)" PreselectContentCleanerAgent="@(this.SettingsManager.ConfigurationData.LegalCheck.PreselectOptions && this.SettingsManager.ConfigurationData.LegalCheck.PreselectContentCleanerAgent)"/>
|
<ReadWebContent @bind-Content="@this.inputLegalDocument" ProviderSettings="@this.providerSettings" @bind-AgentIsRunning="@this.isAgentRunning" Preselect="@(this.SettingsManager.ConfigurationData.LegalCheck.PreselectOptions && this.SettingsManager.ConfigurationData.LegalCheck.PreselectWebContentReader)" PreselectContentCleanerAgent="@(this.SettingsManager.ConfigurationData.LegalCheck.PreselectOptions && this.SettingsManager.ConfigurationData.LegalCheck.PreselectContentCleanerAgent)"/>
|
||||||
}
|
}
|
||||||
|
|
||||||
<ReadPDFContent @bind-PDFContent="@this.inputLegalDocument"/>
|
<ReadFileContent @bind-FileContent="@this.inputLegalDocument"/>
|
||||||
<MudTextField T="string" Disabled="@this.isAgentRunning" @bind-Text="@this.inputLegalDocument" Validation="@this.ValidatingLegalDocument" AdornmentIcon="@Icons.Material.Filled.DocumentScanner" Adornment="Adornment.Start" Label="@T("Legal document")" Variant="Variant.Outlined" Lines="12" AutoGrow="@true" MaxLines="24" Class="mb-3" UserAttributes="@USER_INPUT_ATTRIBUTES"/>
|
<MudTextField T="string" Disabled="@this.isAgentRunning" @bind-Text="@this.inputLegalDocument" Validation="@this.ValidatingLegalDocument" AdornmentIcon="@Icons.Material.Filled.DocumentScanner" Adornment="Adornment.Start" Label="@T("Legal document")" Variant="Variant.Outlined" Lines="12" AutoGrow="@true" MaxLines="24" Class="mb-3" UserAttributes="@USER_INPUT_ATTRIBUTES"/>
|
||||||
<MudTextField T="string" Disabled="@this.isAgentRunning" @bind-Text="@this.inputQuestions" Validation="@this.ValidatingQuestions" AdornmentIcon="@Icons.Material.Filled.QuestionAnswer" Adornment="Adornment.Start" Label="@T("Your questions")" Variant="Variant.Outlined" Lines="6" AutoGrow="@true" MaxLines="12" Class="mb-3" UserAttributes="@USER_INPUT_ATTRIBUTES"/>
|
<MudTextField T="string" Disabled="@this.isAgentRunning" @bind-Text="@this.inputQuestions" Validation="@this.ValidatingQuestions" AdornmentIcon="@Icons.Material.Filled.QuestionAnswer" Adornment="Adornment.Start" Label="@T("Your questions")" Variant="Variant.Outlined" Lines="6" AutoGrow="@true" MaxLines="12" Class="mb-3" UserAttributes="@USER_INPUT_ATTRIBUTES"/>
|
||||||
<ProviderSelection @bind-ProviderSettings="@this.providerSettings" ValidateProvider="@this.ValidatingProvider"/>
|
<ProviderSelection @bind-ProviderSettings="@this.providerSettings" ValidateProvider="@this.ValidatingProvider"/>
|
@ -6,7 +6,7 @@
|
|||||||
<ReadWebContent @bind-Content="@this.inputText" ProviderSettings="@this.providerSettings" @bind-AgentIsRunning="@this.isAgentRunning" Preselect="@(this.SettingsManager.ConfigurationData.TextSummarizer.PreselectOptions && this.SettingsManager.ConfigurationData.TextSummarizer.PreselectWebContentReader)" PreselectContentCleanerAgent="@(this.SettingsManager.ConfigurationData.TextSummarizer.PreselectOptions && this.SettingsManager.ConfigurationData.TextSummarizer.PreselectContentCleanerAgent)"/>
|
<ReadWebContent @bind-Content="@this.inputText" ProviderSettings="@this.providerSettings" @bind-AgentIsRunning="@this.isAgentRunning" Preselect="@(this.SettingsManager.ConfigurationData.TextSummarizer.PreselectOptions && this.SettingsManager.ConfigurationData.TextSummarizer.PreselectWebContentReader)" PreselectContentCleanerAgent="@(this.SettingsManager.ConfigurationData.TextSummarizer.PreselectOptions && this.SettingsManager.ConfigurationData.TextSummarizer.PreselectContentCleanerAgent)"/>
|
||||||
}
|
}
|
||||||
|
|
||||||
<ReadPDFContent @bind-PDFContent="@this.inputText"/>
|
<ReadFileContent @bind-FileContent="@this.inputText"/>
|
||||||
<MudTextField T="string" Disabled="@this.isAgentRunning" @bind-Text="@this.inputText" Validation="@this.ValidatingText" AdornmentIcon="@Icons.Material.Filled.DocumentScanner" Adornment="Adornment.Start" Label="@T("Your input")" Variant="Variant.Outlined" Lines="6" AutoGrow="@true" MaxLines="12" Class="mb-3" UserAttributes="@USER_INPUT_ATTRIBUTES"/>
|
<MudTextField T="string" Disabled="@this.isAgentRunning" @bind-Text="@this.inputText" Validation="@this.ValidatingText" AdornmentIcon="@Icons.Material.Filled.DocumentScanner" Adornment="Adornment.Start" Label="@T("Your input")" Variant="Variant.Outlined" Lines="6" AutoGrow="@true" MaxLines="12" Class="mb-3" UserAttributes="@USER_INPUT_ATTRIBUTES"/>
|
||||||
<EnumSelection T="CommonLanguages" NameFunc="@(language => language.Name())" @bind-Value="@this.selectedTargetLanguage" Icon="@Icons.Material.Filled.Translate" Label="@T("Target language")" AllowOther="@true" @bind-OtherInput="@this.customTargetLanguage" OtherValue="CommonLanguages.OTHER" LabelOther="@T("Custom target language")" ValidateOther="@this.ValidateCustomLanguage" />
|
<EnumSelection T="CommonLanguages" NameFunc="@(language => language.Name())" @bind-Value="@this.selectedTargetLanguage" Icon="@Icons.Material.Filled.Translate" Label="@T("Target language")" AllowOther="@true" @bind-OtherInput="@this.customTargetLanguage" OtherValue="CommonLanguages.OTHER" LabelOther="@T("Custom target language")" ValidateOther="@this.ValidateCustomLanguage" />
|
||||||
<EnumSelection T="Complexity" NameFunc="@(complexity => complexity.Name())" @bind-Value="@this.selectedComplexity" Icon="@Icons.Material.Filled.Layers" Label="@T("Target complexity")" AllowOther="@true" @bind-OtherInput="@this.expertInField" OtherValue="Complexity.SCIENTIFIC_LANGUAGE_OTHER_EXPERTS" LabelOther="@T("Your expertise")" ValidateOther="@this.ValidateExpertInField" />
|
<EnumSelection T="Complexity" NameFunc="@(complexity => complexity.Name())" @bind-Value="@this.selectedComplexity" Icon="@Icons.Material.Filled.Layers" Label="@T("Target complexity")" AllowOther="@true" @bind-OtherInput="@this.expertInField" OtherValue="Complexity.SCIENTIFIC_LANGUAGE_OTHER_EXPERTS" LabelOther="@T("Your expertise")" ValidateOther="@this.ValidateExpertInField" />
|
||||||
|
@ -6,7 +6,8 @@
|
|||||||
<ReadWebContent @bind-Content="@this.inputText" ProviderSettings="@this.providerSettings" @bind-AgentIsRunning="@this.isAgentRunning" Preselect="@(this.SettingsManager.ConfigurationData.Translation.PreselectOptions && this.SettingsManager.ConfigurationData.Translation.PreselectWebContentReader)" PreselectContentCleanerAgent="@(this.SettingsManager.ConfigurationData.Translation.PreselectOptions && this.SettingsManager.ConfigurationData.Translation.PreselectContentCleanerAgent)"/>
|
<ReadWebContent @bind-Content="@this.inputText" ProviderSettings="@this.providerSettings" @bind-AgentIsRunning="@this.isAgentRunning" Preselect="@(this.SettingsManager.ConfigurationData.Translation.PreselectOptions && this.SettingsManager.ConfigurationData.Translation.PreselectWebContentReader)" PreselectContentCleanerAgent="@(this.SettingsManager.ConfigurationData.Translation.PreselectOptions && this.SettingsManager.ConfigurationData.Translation.PreselectContentCleanerAgent)"/>
|
||||||
}
|
}
|
||||||
|
|
||||||
<ReadPDFContent @bind-PDFContent="@this.inputText"/>
|
<ReadFileContent @bind-FileContent="@this.inputText"/>
|
||||||
|
|
||||||
<MudTextSwitch Label="@T("Live translation")" @bind-Value="@this.liveTranslation" LabelOn="@T("Live translation")" LabelOff="@T("No live translation")"/>
|
<MudTextSwitch Label="@T("Live translation")" @bind-Value="@this.liveTranslation" LabelOn="@T("Live translation")" LabelOff="@T("No live translation")"/>
|
||||||
@if (this.liveTranslation)
|
@if (this.liveTranslation)
|
||||||
{
|
{
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
@inherits MSGComponentBase
|
@inherits MSGComponentBase
|
||||||
<MudButton StartIcon="@Icons.Material.Filled.Description" OnClick="async () => await this.SelectFile()" Variant="Variant.Filled" Class="mb-3">
|
<MudButton StartIcon="@Icons.Material.Filled.Description" OnClick="async () => await this.SelectFile()" Variant="Variant.Filled" Class="mb-3">
|
||||||
@T("Use PDF content as input")
|
@T("Use file content as input")
|
||||||
</MudButton>
|
</MudButton>
|
45
app/MindWork AI Studio/Components/ReadFileContent.razor.cs
Normal file
45
app/MindWork AI Studio/Components/ReadFileContent.razor.cs
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
using AIStudio.Tools.Rust;
|
||||||
|
using AIStudio.Tools.Services;
|
||||||
|
|
||||||
|
using Microsoft.AspNetCore.Components;
|
||||||
|
|
||||||
|
namespace AIStudio.Components;
|
||||||
|
|
||||||
|
/// <summary>
/// Component that lets the user pick a file and hands the file's content to the
/// parent through the <see cref="FileContent"/> / <see cref="FileContentChanged"/> binding.
/// Executables and images are rejected with a message instead of being read.
/// </summary>
public partial class ReadFileContent : MSGComponentBase
{
    /// <summary>The bound file content.</summary>
    [Parameter]
    public string FileContent { get; set; } = string.Empty;

    /// <summary>Callback invoked with the content read from the selected file.</summary>
    [Parameter]
    public EventCallback<string> FileContentChanged { get; set; }

    [Inject]
    private RustService RustService { get; init; } = null!;

    /// <summary>
    /// Asks the user for a file, validates its extension, reads it through the
    /// Rust service and publishes the result via <see cref="FileContentChanged"/>.
    /// </summary>
    private async Task SelectFile()
    {
        var selection = await this.RustService.SelectFile(T("Select file to read its content"));

        // Nothing to do when the dialog was cancelled or the chosen file does not exist.
        if (selection.UserCancelled || !File.Exists(selection.SelectedFilePath))
            return;

        var extension = Path.GetExtension(selection.SelectedFilePath).TrimStart('.');

        // Case-insensitive membership test of the file's extension in a filter list.
        bool MatchesAny(string[] candidates) => Array.Exists(candidates, candidate => candidate.Equals(extension, StringComparison.OrdinalIgnoreCase));

        if (MatchesAny(FileTypeFilter.Executables.FilterExtensions))
        {
            await MessageBus.INSTANCE.SendError(new(Icons.Material.Filled.AppBlocking, T("Executables are not allowed")));
            return;
        }

        if (MatchesAny(FileTypeFilter.AllImages.FilterExtensions))
        {
            await MessageBus.INSTANCE.SendWarning(new(Icons.Material.Filled.ImageNotSupported, T("Images are not supported yet")));
            return;
        }

        // Each read gets its own stream id.
        var readStreamId = Guid.NewGuid().ToString();
        var loadedContent = await this.RustService.ReadArbitraryFileData(selection.SelectedFilePath, readStreamId, int.MaxValue);
        await this.FileContentChanged.InvokeAsync(loadedContent);
    }
}
|
@ -1,31 +0,0 @@
|
|||||||
using AIStudio.Tools.Rust;
|
|
||||||
using AIStudio.Tools.Services;
|
|
||||||
|
|
||||||
using Microsoft.AspNetCore.Components;
|
|
||||||
|
|
||||||
namespace AIStudio.Components;
|
|
||||||
|
|
||||||
/// <summary>
/// Component that lets the user pick a PDF file and hands its extracted text to the
/// parent through the <see cref="PDFContent"/> / <see cref="PDFContentChanged"/> binding.
/// </summary>
public partial class ReadPDFContent : MSGComponentBase
{
    /// <summary>The bound PDF text.</summary>
    [Parameter]
    public string PDFContent { get; set; } = string.Empty;

    /// <summary>Callback invoked with the text extracted from the selected PDF.</summary>
    [Parameter]
    public EventCallback<string> PDFContentChanged { get; set; }

    [Inject]
    private RustService RustService { get; init; } = null!;

    /// <summary>
    /// Opens a file dialog restricted to PDFs, extracts the text through the Rust
    /// service and publishes it via <see cref="PDFContentChanged"/>.
    /// </summary>
    private async Task SelectFile()
    {
        var selection = await this.RustService.SelectFile(T("Select PDF file"), FileTypeFilter.PDF);

        // Nothing to do when the dialog was cancelled or the chosen file does not exist.
        if (selection.UserCancelled || !File.Exists(selection.SelectedFilePath))
            return;

        var extractedText = await this.RustService.GetPDFText(selection.SelectedFilePath);
        await this.PDFContentChanged.InvokeAsync(extractedText);
    }
}
|
|
@ -1650,11 +1650,17 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROFILESELECTION::T918741365"] = "Hier k
|
|||||||
-- Provider
|
-- Provider
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROVIDERSELECTION::T900237532"] = "Anbieter"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROVIDERSELECTION::T900237532"] = "Anbieter"
|
||||||
|
|
||||||
-- Use PDF content as input
|
-- Images are not supported yet
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READPDFCONTENT::T2849276709"] = "PDF-Inhalt als Eingabe verwenden"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T298062956"] = "Bilder werden derzeit nicht unterstützt"
|
||||||
|
|
||||||
-- Select PDF file
|
-- Use file content as input
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READPDFCONTENT::T63272795"] = "PDF-Datei auswählen"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T3499386973"] = "Dokumenteninhalt als Eingabe verwenden"
|
||||||
|
|
||||||
|
-- Select file to read its content
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T354817589"] = "Datei auswählen, um den Inhalt zu lesen"
|
||||||
|
|
||||||
|
-- Executables are not allowed
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T4167762413"] = "Ausführbare Dateien sind nicht erlaubt"
|
||||||
|
|
||||||
-- The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used.
|
-- The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used.
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READWEBCONTENT::T1164201762"] = "Der Inhalt wird mithilfe eines LLM-Agents bereinigt: Der Hauptinhalt wird extrahiert, Werbung und andere irrelevante Elemente werden nach Möglichkeit entfernt. Relative Links werden nach Möglichkeit in absolute Links umgewandelt, damit sie verwendet werden können."
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READWEBCONTENT::T1164201762"] = "Der Inhalt wird mithilfe eines LLM-Agents bereinigt: Der Hauptinhalt wird extrahiert, Werbung und andere irrelevante Elemente werden nach Möglichkeit entfernt. Relative Links werden nach Möglichkeit in absolute Links umgewandelt, damit sie verwendet werden können."
|
||||||
@ -5484,6 +5490,9 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T304
|
|||||||
-- AI-based data source selection with AI retrieval context validation
|
-- AI-based data source selection with AI retrieval context validation
|
||||||
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "KI-basierte Datenquellen-Auswahl mit Validierung des Abrufkontexts"
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "KI-basierte Datenquellen-Auswahl mit Validierung des Abrufkontexts"
|
||||||
|
|
||||||
|
-- Executable Files
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2217313358"] = "Ausführbare Dateien"
|
||||||
|
|
||||||
-- PDF Files
|
-- PDF Files
|
||||||
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF-Dateien"
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF-Dateien"
|
||||||
|
|
||||||
|
@ -1650,11 +1650,17 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROFILESELECTION::T918741365"] = "You can
|
|||||||
-- Provider
|
-- Provider
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROVIDERSELECTION::T900237532"] = "Provider"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::PROVIDERSELECTION::T900237532"] = "Provider"
|
||||||
|
|
||||||
-- Use PDF content as input
|
-- Images are not supported yet
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READPDFCONTENT::T2849276709"] = "Use PDF content as input"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T298062956"] = "Images are not supported yet"
|
||||||
|
|
||||||
-- Select PDF file
|
-- Use file content as input
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READPDFCONTENT::T63272795"] = "Select PDF file"
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T3499386973"] = "Use file content as input"
|
||||||
|
|
||||||
|
-- Select file to read its content
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T354817589"] = "Select file to read its content"
|
||||||
|
|
||||||
|
-- Executables are not allowed
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READFILECONTENT::T4167762413"] = "Executables are not allowed"
|
||||||
|
|
||||||
-- The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used.
|
-- The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used.
|
||||||
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READWEBCONTENT::T1164201762"] = "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used."
|
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::READWEBCONTENT::T1164201762"] = "The content is cleaned using an LLM agent: the main content is extracted, advertisements and other irrelevant things are attempted to be removed; relative links are attempted to be converted into absolute links so that they can be used."
|
||||||
@ -5484,6 +5490,9 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T304
|
|||||||
-- AI-based data source selection with AI retrieval context validation
|
-- AI-based data source selection with AI retrieval context validation
|
||||||
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "AI-based data source selection with AI retrieval context validation"
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RAG::RAGPROCESSES::AISRCSELWITHRETCTXVAL::T3775725978"] = "AI-based data source selection with AI retrieval context validation"
|
||||||
|
|
||||||
|
-- Executable Files
|
||||||
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T2217313358"] = "Executable Files"
|
||||||
|
|
||||||
-- PDF Files
|
-- PDF Files
|
||||||
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF Files"
|
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPEFILTER::T3108466742"] = "PDF Files"
|
||||||
|
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Metadata type without members of its own. ContentStreamMetadataJsonConverter
/// selects it when the first JSON property of the metadata object is "Document".
/// </summary>
public sealed class ContentStreamDocumentMetadata : ContentStreamSseMetadata;
|
@ -0,0 +1,4 @@
|
|||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Metadata type without members of its own. ContentStreamMetadataJsonConverter
/// selects it when the first JSON property of the metadata object is "Image".
/// </summary>
public sealed class ContentStreamImageMetadata: ContentStreamSseMetadata;
|
@ -0,0 +1,32 @@
|
|||||||
|
using System.Text.Json;
|
||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
/// <summary>
/// JSON converter for <see cref="ContentStreamSseMetadata"/> that picks the concrete
/// metadata type from the name of the first property of the JSON object
/// ("Text", "Pdf", "Spreadsheet", "Presentation", "Image" or "Document").
/// </summary>
public sealed class ContentStreamMetadataJsonConverter : JsonConverter<ContentStreamSseMetadata>
{
    /// <summary>
    /// Reads one metadata value and deserializes it into the matching subtype.
    /// </summary>
    /// <returns>
    /// The deserialized metadata, or <c>null</c> when the value is not a JSON object
    /// or its first property name is not one of the known discriminators.
    /// </returns>
    public override ContentStreamSseMetadata? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        // Always consume the whole value so the reader ends up past it.
        using var jsonDoc = JsonDocument.ParseValue(ref reader);
        var root = jsonDoc.RootElement;

        // EnumerateObject() throws InvalidOperationException for anything but an object
        // (e.g. a string, number or array); treat such values like unknown metadata.
        if (root.ValueKind is not JsonValueKind.Object)
            return null;

        var rawText = root.GetRawText();

        // The first property name acts as the type discriminator.
        var propertyName = root.EnumerateObject()
            .Select(p => p.Name)
            .FirstOrDefault();

        return propertyName switch
        {
            "Text" => JsonSerializer.Deserialize<ContentStreamTextMetadata?>(rawText, options),
            "Pdf" => JsonSerializer.Deserialize<ContentStreamPdfMetadata?>(rawText, options),
            "Spreadsheet" => JsonSerializer.Deserialize<ContentStreamSpreadsheetMetadata?>(rawText, options),
            "Presentation" => JsonSerializer.Deserialize<ContentStreamPresentationMetadata?>(rawText, options),
            "Image" => JsonSerializer.Deserialize<ContentStreamImageMetadata?>(rawText, options),
            "Document" => JsonSerializer.Deserialize<ContentStreamDocumentMetadata?>(rawText, options),

            _ => null
        };
    }

    /// <summary>
    /// Writes the value using its runtime type so the subtype's properties are serialized.
    /// </summary>
    public override void Write(Utf8JsonWriter writer, ContentStreamSseMetadata value, JsonSerializerOptions options) => JsonSerializer.Serialize(writer, value, value.GetType(), options);
}
|
11
app/MindWork AI Studio/Tools/ContentStreamPdfDetails.cs
Normal file
11
app/MindWork AI Studio/Tools/ContentStreamPdfDetails.cs
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable UnusedAutoPropertyAccessor.Global
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Details carried by PDF metadata of a content-stream event.
/// </summary>
public sealed class ContentStreamPdfDetails
|
||||||
|
{
|
||||||
|
/// <summary>
/// Page number, read from the JSON property "page_number"; may be absent.
/// ContentStreamSseHandler falls back to 0 when it is null.
/// </summary>
[JsonPropertyName("page_number")]
|
||||||
|
public int? PageNumber { get; init; }
|
||||||
|
}
|
11
app/MindWork AI Studio/Tools/ContentStreamPdfMetadata.cs
Normal file
11
app/MindWork AI Studio/Tools/ContentStreamPdfMetadata.cs
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable UnusedAutoPropertyAccessor.Global
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Metadata variant whose JSON object carries a "Pdf" property.
/// </summary>
public sealed class ContentStreamPdfMetadata : ContentStreamSseMetadata
|
||||||
|
{
|
||||||
|
/// <summary>The PDF details, read from the JSON property "Pdf"; may be absent.</summary>
[JsonPropertyName("Pdf")]
|
||||||
|
public ContentStreamPdfDetails? Pdf { get; init; }
|
||||||
|
}
|
18
app/MindWork AI Studio/Tools/ContentStreamPptxImageData.cs
Normal file
18
app/MindWork AI Studio/Tools/ContentStreamPptxImageData.cs
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
/// <summary>
/// One segment of an image that is delivered in several chunks.
/// ContentStreamSseHandler collects the segments per image id, orders them by
/// <see cref="Segment"/> and concatenates their <see cref="Content"/>.
/// </summary>
public sealed class ContentStreamPptxImageData
|
||||||
|
{
|
||||||
|
/// <summary>Image identifier, read from the JSON property "id".</summary>
[JsonPropertyName("id")]
|
||||||
|
public string? Id { get; init; }
|
||||||
|
|
||||||
|
/// <summary>Payload of this segment, read from the JSON property "content".</summary>
[JsonPropertyName("content")]
|
||||||
|
public string? Content { get; init; }
|
||||||
|
|
||||||
|
/// <summary>Ordering index of this segment, read from the JSON property "segment".</summary>
[JsonPropertyName("segment")]
|
||||||
|
public int? Segment { get; init; }
|
||||||
|
|
||||||
|
/// <summary>Flag read from the JSON property "is_end"; the handler assembles the image once it is true.</summary>
[JsonPropertyName("is_end")]
|
||||||
|
public bool IsEnd { get; init; }
|
||||||
|
}
|
@ -0,0 +1,14 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable UnusedAutoPropertyAccessor.Global
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Details carried by presentation metadata of a content-stream event.
/// </summary>
public sealed class ContentStreamPresentationDetails
|
||||||
|
{
|
||||||
|
/// <summary>
/// Slide number, read from the JSON property "slide_number"; may be absent.
/// ContentStreamSseHandler falls back to 0 when it is null.
/// </summary>
[JsonPropertyName("slide_number")]
|
||||||
|
public int? SlideNumber { get; init; }
|
||||||
|
|
||||||
|
/// <summary>Optional image segment attached to the event, read from the JSON property "image".</summary>
[JsonPropertyName("image")]
|
||||||
|
public ContentStreamPptxImageData? Image { get; init; }
|
||||||
|
}
|
@ -0,0 +1,11 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable UnusedAutoPropertyAccessor.Global
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Metadata variant whose JSON object carries a "Presentation" property.
/// </summary>
public sealed class ContentStreamPresentationMetadata : ContentStreamSseMetadata
|
||||||
|
{
|
||||||
|
/// <summary>The presentation details, read from the JSON property "Presentation"; may be absent.</summary>
[JsonPropertyName("Presentation")]
|
||||||
|
public ContentStreamPresentationDetails? Presentation { get; init; }
|
||||||
|
}
|
@ -0,0 +1,14 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable UnusedAutoPropertyAccessor.Global
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Details carried by spreadsheet metadata of a content-stream event.
/// </summary>
public sealed class ContentStreamSpreadsheetDetails
|
||||||
|
{
|
||||||
|
/// <summary>Sheet name, read from the JSON property "sheet_name"; may be absent.</summary>
[JsonPropertyName("sheet_name")]
|
||||||
|
public string? SheetName { get; init; }
|
||||||
|
|
||||||
|
/// <summary>
/// Row number, read from the JSON property "row_number"; may be absent.
/// ContentStreamSseHandler emits the sheet heading when this equals 1.
/// </summary>
[JsonPropertyName("row_number")]
|
||||||
|
public int? RowNumber { get; init; }
|
||||||
|
}
|
@ -0,0 +1,11 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable UnusedAutoPropertyAccessor.Global
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Metadata variant whose JSON object carries a "Spreadsheet" property.
/// </summary>
public sealed class ContentStreamSpreadsheetMetadata : ContentStreamSseMetadata
|
||||||
|
{
|
||||||
|
/// <summary>The spreadsheet details, read from the JSON property "Spreadsheet"; may be absent.</summary>
[JsonPropertyName("Spreadsheet")]
|
||||||
|
public ContentStreamSpreadsheetDetails? Spreadsheet { get; init; }
|
||||||
|
}
|
15
app/MindWork AI Studio/Tools/ContentStreamSseEvent.cs
Normal file
15
app/MindWork AI Studio/Tools/ContentStreamSseEvent.cs
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
/// <summary>
/// One event of a content stream: a piece of content, the id of the stream it
/// belongs to, and optional metadata describing the content.
/// </summary>
public sealed class ContentStreamSseEvent
|
||||||
|
{
|
||||||
|
/// <summary>The content of this event, read from the JSON property "content".</summary>
[JsonPropertyName("content")]
|
||||||
|
public string? Content { get; init; }
|
||||||
|
|
||||||
|
/// <summary>Identifier of the stream, read from the JSON property "stream_id".</summary>
[JsonPropertyName("stream_id")]
|
||||||
|
public string? StreamId { get; init; }
|
||||||
|
|
||||||
|
/// <summary>
/// Metadata of the content, read from the JSON property "metadata". The concrete
/// subtype is chosen by the converter attached to ContentStreamSseMetadata.
/// </summary>
[JsonPropertyName("metadata")]
|
||||||
|
public ContentStreamSseMetadata? Metadata { get; init; }
|
||||||
|
}
|
121
app/MindWork AI Studio/Tools/ContentStreamSseHandler.cs
Normal file
121
app/MindWork AI Studio/Tools/ContentStreamSseHandler.cs
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
using System.Collections.Concurrent;
|
||||||
|
using System.Text;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
public static class ContentStreamSseHandler
|
||||||
|
{
|
||||||
|
private static readonly ConcurrentDictionary<string, List<ContentStreamPptxImageData>> CHUNKED_IMAGES = new();
|
||||||
|
private static readonly ConcurrentDictionary<string, int> CURRENT_SLIDE_NUMBERS = new();
|
||||||
|
|
||||||
|
public static string ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true)
|
||||||
|
{
|
||||||
|
switch (sseEvent)
|
||||||
|
{
|
||||||
|
case { Content: not null, Metadata: not null }:
|
||||||
|
switch (sseEvent.Metadata)
|
||||||
|
{
|
||||||
|
case ContentStreamTextMetadata:
|
||||||
|
return $"{sseEvent.Content}\n";
|
||||||
|
|
||||||
|
case ContentStreamPdfMetadata pdfMetadata:
|
||||||
|
var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0;
|
||||||
|
return $"# Page {pageNumber}\n{sseEvent.Content}";
|
||||||
|
|
||||||
|
case ContentStreamSpreadsheetMetadata spreadsheetMetadata:
|
||||||
|
var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName;
|
||||||
|
var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber;
|
||||||
|
var spreadSheetResult = new StringBuilder();
|
||||||
|
if (rowNumber == 1)
|
||||||
|
spreadSheetResult.AppendLine($"\n# {sheetName}");
|
||||||
|
|
||||||
|
spreadSheetResult.AppendLine($"{sseEvent.Content}");
|
||||||
|
return spreadSheetResult.ToString();
|
||||||
|
|
||||||
|
case ContentStreamDocumentMetadata:
|
||||||
|
case ContentStreamImageMetadata:
|
||||||
|
return $"{sseEvent.Content}";
|
||||||
|
|
||||||
|
case ContentStreamPresentationMetadata presentationMetadata:
|
||||||
|
var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0;
|
||||||
|
var image = presentationMetadata.Presentation?.Image ?? null;
|
||||||
|
var presentationResult = new StringBuilder();
|
||||||
|
var streamId = sseEvent.StreamId;
|
||||||
|
|
||||||
|
CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber);
|
||||||
|
|
||||||
|
if (slideNumber != currentSlideNumber)
|
||||||
|
presentationResult.AppendLine($"# Slide {slideNumber}");
|
||||||
|
|
||||||
|
presentationResult.Append($"{sseEvent.Content}");
|
||||||
|
|
||||||
|
if (image is not null)
|
||||||
|
{
|
||||||
|
var imageId = $"{streamId}-{image.Id!}";
|
||||||
|
var isEnd = ProcessImageSegment(imageId, image);
|
||||||
|
if (isEnd && extractImages)
|
||||||
|
presentationResult.AppendLine(BuildImage(imageId));
|
||||||
|
}
|
||||||
|
|
||||||
|
CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber;
|
||||||
|
|
||||||
|
return presentationResult.ToString();
|
||||||
|
default:
|
||||||
|
return sseEvent.Content;
|
||||||
|
}
|
||||||
|
|
||||||
|
case { Content: not null, Metadata: null }:
|
||||||
|
return sseEvent.Content;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return string.Empty;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Records one segment of a chunked presentation image in <c>CHUNKED_IMAGES</c>,
/// keyed by <paramref name="imageId"/>.
/// </summary>
/// <param name="imageId">The key under which all segments of the image are collected.</param>
/// <param name="contentStreamPptxImageData">The segment that was received from the stream.</param>
/// <returns>
/// <c>false</c> when the segment's own id or <paramref name="imageId"/> is null or whitespace
/// (nothing is stored in that case); otherwise the segment's <c>IsEnd</c> flag.
/// </returns>
private static bool ProcessImageSegment(string imageId, ContentStreamPptxImageData contentStreamPptxImageData)
{
    var hasSourceId = !string.IsNullOrWhiteSpace(contentStreamPptxImageData.Id);
    var hasTargetId = !string.IsNullOrWhiteSpace(imageId);
    if (!hasSourceId || !hasTargetId)
        return false;

    // Store a normalized copy: re-keyed to the combined image id, with
    // missing content and a missing segment index replaced by defaults.
    var storedSegment = new ContentStreamPptxImageData
    {
        Id = imageId,
        Content = contentStreamPptxImageData.Content ?? string.Empty,
        Segment = contentStreamPptxImageData.Segment ?? 0,
        IsEnd = contentStreamPptxImageData.IsEnd,
    };

    // The first segment creates the list; later segments are appended to it.
    CHUNKED_IMAGES.AddOrUpdate(
        imageId,
        _ => [storedSegment],
        (_, collectedSegments) =>
        {
            collectedSegments.Add(storedSegment);
            return collectedSegments;
        });

    return storedSegment.IsEnd;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Assembles the image stored under <paramref name="id"/> by concatenating the
/// content of its segments in ascending segment order, then removes the entry
/// from <c>CHUNKED_IMAGES</c>.
/// </summary>
/// <param name="id">The key under which the segments were collected.</param>
/// <returns>The concatenated segment contents, or an empty string when no entry exists for the id.</returns>
private static string BuildImage(string id)
{
    if (CHUNKED_IMAGES.TryGetValue(id, out var collectedSegments))
    {
        // Segments are ordered by their index before being joined;
        // segments without content are left out.
        var orderedSegments = collectedSegments.OrderBy(segment => segment.Segment).ToList();
        var contents =
            from segment in orderedSegments
            where segment.Content != null
            select segment.Content;

        var assembledImage = string.Join(string.Empty, contents);

        CHUNKED_IMAGES.Remove(id, out _);
        return assembledImage;
    }

    return string.Empty;
}
|
||||||
|
}
|
6
app/MindWork AI Studio/Tools/ContentStreamSseMetadata.cs
Normal file
6
app/MindWork AI Studio/Tools/ContentStreamSseMetadata.cs
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
/// <summary>
/// Abstract base type for the metadata attached to a content stream SSE event.
/// JSON handling is delegated to <see cref="ContentStreamMetadataJsonConverter"/>
/// through the <c>JsonConverter</c> attribute.
/// </summary>
[JsonConverter(typeof(ContentStreamMetadataJsonConverter))]
|
||||||
|
public abstract class ContentStreamSseMetadata;
|
10
app/MindWork AI Studio/Tools/ContentStreamTextDetails.cs
Normal file
10
app/MindWork AI Studio/Tools/ContentStreamTextDetails.cs
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Detail payload carried by <c>ContentStreamTextMetadata</c>.
/// </summary>
public sealed class ContentStreamTextDetails
|
||||||
|
{
|
||||||
|
/// <summary>
/// Bound to the JSON property <c>line_number</c>. Nullable, so a missing value is representable.
/// </summary>
[JsonPropertyName("line_number")]
|
||||||
|
public int? LineNumber { get; init; }
|
||||||
|
}
|
10
app/MindWork AI Studio/Tools/ContentStreamTextMetadata.cs
Normal file
10
app/MindWork AI Studio/Tools/ContentStreamTextMetadata.cs
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace AIStudio.Tools;
|
||||||
|
|
||||||
|
// ReSharper disable ClassNeverInstantiated.Global
|
||||||
|
/// <summary>
/// Metadata variant of <see cref="ContentStreamSseMetadata"/> for text content.
/// </summary>
public sealed class ContentStreamTextMetadata : ContentStreamSseMetadata
|
||||||
|
{
|
||||||
|
/// <summary>
/// Bound to the JSON property <c>Text</c> (capitalized).
/// NOTE(review): presumably mirrors the runtime's <c>Metadata::Text</c> variant; confirm against the serialized output.
/// </summary>
[JsonPropertyName("Text")]
|
||||||
|
public ContentStreamTextDetails? Text { get; init; }
|
||||||
|
}
|
@ -4,5 +4,5 @@ namespace AIStudio.Tools;
|
|||||||
|
|
||||||
/// <summary>
/// Describes an enterprise configuration source: the configuration server URL,
/// the configuration id, and an optional entity tag.
/// </summary>
public readonly record struct EnterpriseEnvironment(
    string ConfigurationServerUrl,
    Guid ConfigurationId,
    EntityTagHeaderValue? ETag)
{
    /// <summary>
    /// True only when the server URL is neither null, empty, nor whitespace
    /// and the configuration id is not <see cref="Guid.Empty"/>.
    /// </summary>
    public bool IsActive
    {
        get
        {
            if (string.IsNullOrWhiteSpace(this.ConfigurationServerUrl))
                return false;

            return this.ConfigurationId != Guid.Empty;
        }
    }
}
|
@ -20,4 +20,6 @@ public readonly record struct FileTypeFilter(string FilterName, string[] FilterE
|
|||||||
public static FileTypeFilter AllOffice => new(TB("All Office Files"), ["docx", "xlsx", "pptx", "doc", "xls", "ppt", "pdf"]);
|
public static FileTypeFilter AllOffice => new(TB("All Office Files"), ["docx", "xlsx", "pptx", "doc", "xls", "ppt", "pdf"]);
|
||||||
|
|
||||||
public static FileTypeFilter AllImages => new(TB("All Image Files"), ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]);
|
public static FileTypeFilter AllImages => new(TB("All Image Files"), ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]);
|
||||||
|
|
||||||
|
public static FileTypeFilter Executables => new(TB("Executable Files"), ["exe", "app", "bin", "appimage"]);
|
||||||
}
|
}
|
@ -1,16 +1,52 @@
|
|||||||
|
using System.Text;
|
||||||
|
using System.Text.Json;
|
||||||
|
|
||||||
namespace AIStudio.Tools.Services;
|
namespace AIStudio.Tools.Services;
|
||||||
|
|
||||||
public sealed partial class RustService
{
    /// <summary>
    /// Asks the runtime to extract the content of a file and collects the streamed
    /// result into a single string.
    /// </summary>
    /// <remarks>
    /// The runtime answers with a stream of server-sent events. Every line starting with
    /// <c>data:</c> is deserialized into a <see cref="ContentStreamSseEvent"/> and turned
    /// into text by <see cref="ContentStreamSseHandler.ProcessEvent"/>; image extraction is
    /// switched off (the second argument passed is <c>false</c>). Lines that are blank, lack
    /// the prefix, or cannot be deserialized are skipped and do not count as chunks.
    /// </remarks>
    /// <param name="path">The path of the file to read.</param>
    /// <param name="streamId">The id sent to the runtime as <c>stream_id</c> query parameter.</param>
    /// <param name="maxChunks">The maximum number of successfully processed events to collect.</param>
    /// <returns>
    /// The concatenated text of the processed events, or an empty string when the
    /// runtime answers with a non-success status code.
    /// </returns>
    public async Task<string> ReadArbitraryFileData(string path, string streamId, int maxChunks)
    {
        // Both values end up in the query string, so both must be escaped.
        var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={Uri.EscapeDataString(streamId)}";

        // Request and response hold unmanaged resources; with ResponseHeadersRead the
        // connection stays busy until the response is disposed.
        using var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
        using var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);

        if (!response.IsSuccessStatusCode)
        {
            this.logger?.LogError("Failed to read the file '{Path}'; the runtime answered with status code '{StatusCode}'.", path, response.StatusCode);
            return string.Empty;
        }

        await using var stream = await response.Content.ReadAsStreamAsync();
        using var reader = new StreamReader(stream);

        var resultBuilder = new StringBuilder();
        var chunkCount = 0;

        while (chunkCount < maxChunks)
        {
            // ReadLineAsync returns null at the end of the stream. Checking
            // reader.EndOfStream instead may block synchronously inside this async method.
            var line = await reader.ReadLineAsync();
            if (line is null)
                break;

            if (string.IsNullOrWhiteSpace(line))
                continue;

            // The prefix is a protocol token, so it is compared ordinally; this also
            // guarantees that exactly five characters are cut off below.
            if (!line.StartsWith("data:", StringComparison.Ordinal))
                continue;

            var jsonContent = line[5..];

            try
            {
                var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
                if (sseEvent is not null)
                {
                    var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false);
                    resultBuilder.Append(content);
                    chunkCount++;
                }
            }
            catch (JsonException)
            {
                this.logger?.LogError("Failed to deserialize SSE event: {JsonContent}", jsonContent);
            }
        }

        return resultBuilder.ToString();
    }
}
|
@ -23,16 +23,35 @@ use tokio_stream::wrappers::ReceiverStream;
|
|||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
pub struct Chunk {
|
pub struct Chunk {
|
||||||
pub content: String,
|
pub content: String,
|
||||||
|
pub stream_id: String,
|
||||||
pub metadata: Metadata,
|
pub metadata: Metadata,
|
||||||
}
|
}
|
||||||
|
impl Chunk {
|
||||||
|
pub fn new(content: String, metadata: Metadata) -> Self {
|
||||||
|
Chunk { content, stream_id: String::new(), metadata }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_stream_id(&mut self, stream_id: &str) { self.stream_id = stream_id.to_string(); }
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
pub enum Metadata {
|
pub enum Metadata {
|
||||||
Text { line_number: usize },
|
Text {
|
||||||
Pdf { page_number: usize },
|
line_number: usize
|
||||||
Spreadsheet { sheet_name: String, row_number: usize },
|
},
|
||||||
Document,
|
|
||||||
Image,
|
Pdf {
|
||||||
|
page_number: usize
|
||||||
|
},
|
||||||
|
|
||||||
|
Spreadsheet {
|
||||||
|
sheet_name: String,
|
||||||
|
row_number: usize,
|
||||||
|
},
|
||||||
|
|
||||||
|
Document {},
|
||||||
|
Image {},
|
||||||
|
|
||||||
Presentation {
|
Presentation {
|
||||||
slide_number: u32,
|
slide_number: u32,
|
||||||
image: Option<Base64Image>,
|
image: Option<Base64Image>,
|
||||||
@ -61,18 +80,23 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
|
|||||||
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
||||||
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
||||||
|
|
||||||
#[get("/retrieval/fs/extract?<path>")]
|
#[get("/retrieval/fs/extract?<path>&<stream_id>")]
|
||||||
pub async fn extract_data(_token: APIToken, path: String, mut end: Shutdown) -> EventStream![] {
|
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] {
|
||||||
EventStream! {
|
EventStream! {
|
||||||
let stream_result = stream_data(&path).await;
|
let stream_result = stream_data(&path).await;
|
||||||
|
let id_ref = &stream_id;
|
||||||
|
|
||||||
match stream_result {
|
match stream_result {
|
||||||
Ok(mut stream) => {
|
Ok(mut stream) => {
|
||||||
loop {
|
loop {
|
||||||
let chunk = select! {
|
let chunk = select! {
|
||||||
chunk = stream.next() => match chunk {
|
chunk = stream.next() => match chunk {
|
||||||
Some(Ok(chunk)) => chunk,
|
Some(Ok(mut chunk)) => {
|
||||||
|
chunk.set_stream_id(id_ref);
|
||||||
|
chunk
|
||||||
|
},
|
||||||
Some(Err(e)) => {
|
Some(Err(e)) => {
|
||||||
yield Event::json(&format!("Error: {}", e));
|
yield Event::json(&format!("Error: {e}"));
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
None => break,
|
None => break,
|
||||||
@ -85,7 +109,7 @@ pub async fn extract_data(_token: APIToken, path: String, mut end: Shutdown) ->
|
|||||||
},
|
},
|
||||||
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
yield Event::json(&format!("Error starting stream: {}", e));
|
yield Event::json(&format!("Error starting stream: {e}"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -160,40 +184,16 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
|
|||||||
let stream = stream! {
|
let stream = stream! {
|
||||||
while let Ok(Some(line)) = lines.next_line().await {
|
while let Ok(Some(line)) = lines.next_line().await {
|
||||||
line_number += 1;
|
line_number += 1;
|
||||||
yield Ok(Chunk {
|
yield Ok(Chunk::new(
|
||||||
content: line,
|
line,
|
||||||
metadata: Metadata::Text { line_number },
|
Metadata::Text { line_number }
|
||||||
});
|
));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[get("/retrieval/fs/read/pdf?<file_path>")]
|
|
||||||
pub fn read_pdf(_token: APIToken, file_path: String) -> String {
|
|
||||||
let pdfium = Pdfium::ai_studio_init();
|
|
||||||
let doc = match pdfium.load_pdf_from_file(&file_path, None) {
|
|
||||||
Ok(document) => document,
|
|
||||||
Err(e) => return e.to_string(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut pdf_content = String::new();
|
|
||||||
for page in doc.pages().iter() {
|
|
||||||
let content = match page.text().map(|text_content| text_content.all()) {
|
|
||||||
Ok(content) => content,
|
|
||||||
Err(_) => {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
pdf_content.push_str(&content);
|
|
||||||
pdf_content.push_str("\n\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
pdf_content
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
|
||||||
let path = file_path.to_owned();
|
let path = file_path.to_owned();
|
||||||
let (tx, rx) = mpsc::channel(10);
|
let (tx, rx) = mpsc::channel(10);
|
||||||
@ -217,10 +217,10 @@ async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if tx.blocking_send(Ok(Chunk {
|
if tx.blocking_send(Ok(Chunk::new(
|
||||||
content,
|
content,
|
||||||
metadata: Metadata::Pdf { page_number: num_page + 1 },
|
Metadata::Pdf { page_number: num_page + 1 }
|
||||||
})).is_err() {
|
))).is_err() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -257,13 +257,13 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
|
|||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
.join(",");
|
.join(",");
|
||||||
|
|
||||||
if tx.blocking_send(Ok(Chunk {
|
if tx.blocking_send(Ok(Chunk::new(
|
||||||
content,
|
content,
|
||||||
metadata: Metadata::Spreadsheet {
|
Metadata::Spreadsheet {
|
||||||
sheet_name: sheet_name.clone(),
|
sheet_name: sheet_name.clone(),
|
||||||
row_number: row_idx + 1,
|
row_number: row_idx + 1,
|
||||||
},
|
}
|
||||||
})).is_err() {
|
))).is_err() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -288,10 +288,10 @@ async fn convert_with_pandoc(
|
|||||||
let stream = stream! {
|
let stream = stream! {
|
||||||
if output.status.success() {
|
if output.status.success() {
|
||||||
match String::from_utf8(output.stdout.clone()) {
|
match String::from_utf8(output.stdout.clone()) {
|
||||||
Ok(content) => yield Ok(Chunk {
|
Ok(content) => yield Ok(Chunk::new(
|
||||||
content,
|
content,
|
||||||
metadata: Metadata::Document,
|
Metadata::Document {}
|
||||||
}),
|
)),
|
||||||
Err(e) => yield Err(e.into()),
|
Err(e) => yield Err(e.into()),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -310,10 +310,10 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
|
|||||||
let base64 = general_purpose::STANDARD.encode(&data);
|
let base64 = general_purpose::STANDARD.encode(&data);
|
||||||
|
|
||||||
let stream = stream! {
|
let stream = stream! {
|
||||||
yield Ok(Chunk {
|
yield Ok(Chunk::new(
|
||||||
content: base64,
|
base64,
|
||||||
metadata: Metadata::Image,
|
Metadata::Image {},
|
||||||
});
|
));
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
@ -340,13 +340,13 @@ async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
|||||||
match slide_result {
|
match slide_result {
|
||||||
Ok(slide) => {
|
Ok(slide) => {
|
||||||
if let Some(md_content) = slide.convert_to_md() {
|
if let Some(md_content) = slide.convert_to_md() {
|
||||||
let chunk = Chunk {
|
let chunk = Chunk::new(
|
||||||
content: md_content,
|
md_content,
|
||||||
metadata: Metadata::Presentation {
|
Metadata::Presentation {
|
||||||
slide_number: slide.slide_number,
|
slide_number: slide.slide_number,
|
||||||
image: None,
|
image: None,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
|
|
||||||
if tx.send(Ok(chunk)).await.is_err() {
|
if tx.send(Ok(chunk)).await.is_err() {
|
||||||
break;
|
break;
|
||||||
@ -373,13 +373,13 @@ async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
|||||||
is_end
|
is_end
|
||||||
);
|
);
|
||||||
|
|
||||||
let chunk = Chunk {
|
let chunk = Chunk::new(
|
||||||
content: String::new(),
|
String::new(),
|
||||||
metadata: Metadata::Presentation {
|
Metadata::Presentation {
|
||||||
slide_number: slide.slide_number,
|
slide_number: slide.slide_number,
|
||||||
image: Some(base64_image),
|
image: Some(base64_image),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
|
|
||||||
if tx.send(Ok(chunk)).await.is_err() {
|
if tx.send(Ok(chunk)).await.is_err() {
|
||||||
break;
|
break;
|
||||||
|
@ -82,7 +82,6 @@ pub fn start_runtime_api() {
|
|||||||
crate::environment::delete_enterprise_env_config_id,
|
crate::environment::delete_enterprise_env_config_id,
|
||||||
crate::environment::read_enterprise_env_config_server_url,
|
crate::environment::read_enterprise_env_config_server_url,
|
||||||
crate::file_data::extract_data,
|
crate::file_data::extract_data,
|
||||||
crate::file_data::read_pdf,
|
|
||||||
crate::log::get_log_paths,
|
crate::log::get_log_paths,
|
||||||
])
|
])
|
||||||
.ignite().await.unwrap()
|
.ignite().await.unwrap()
|
||||||
|
Loading…
Reference in New Issue
Block a user