diff --git a/app/MindWork AI Studio/Components/ReadFileContent.razor.cs b/app/MindWork AI Studio/Components/ReadFileContent.razor.cs index f01b8bc8..86bafebe 100644 --- a/app/MindWork AI Studio/Components/ReadFileContent.razor.cs +++ b/app/MindWork AI Studio/Components/ReadFileContent.razor.cs @@ -38,8 +38,7 @@ public partial class ReadFileContent : MSGComponentBase return; } - var streamId = Guid.NewGuid().ToString(); - var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, streamId, int.MaxValue); + var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, int.MaxValue); await this.FileContentChanged.InvokeAsync(fileContent); } } \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/ContentStreamSseHandler.cs b/app/MindWork AI Studio/Tools/ContentStreamSseHandler.cs index ba640f01..4090d49b 100644 --- a/app/MindWork AI Studio/Tools/ContentStreamSseHandler.cs +++ b/app/MindWork AI Studio/Tools/ContentStreamSseHandler.cs @@ -8,7 +8,7 @@ public static class ContentStreamSseHandler private static readonly ConcurrentDictionary> CHUNKED_IMAGES = new(); private static readonly ConcurrentDictionary CURRENT_SLIDE_NUMBERS = new(); - public static string ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true) + public static string? ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true) { switch (sseEvent) { @@ -16,40 +16,50 @@ public static class ContentStreamSseHandler switch (sseEvent.Metadata) { case ContentStreamTextMetadata: - return $"{sseEvent.Content}\n"; + return sseEvent.Content; case ContentStreamPdfMetadata pdfMetadata: var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0; - return $"# Page {pageNumber}\n{sseEvent.Content}"; + return $""" + # Page {pageNumber} + {sseEvent.Content} + + """; case ContentStreamSpreadsheetMetadata spreadsheetMetadata: var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName; var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber; var spreadSheetResult = new StringBuilder(); - if (rowNumber == 1) - spreadSheetResult.AppendLine($"\n# {sheetName}"); - - spreadSheetResult.AppendLine($"{sseEvent.Content}"); + if (rowNumber == 0) + { + spreadSheetResult.AppendLine(); + spreadSheetResult.AppendLine($"# {sheetName}"); + } + + spreadSheetResult.Append(sseEvent.Content); return spreadSheetResult.ToString(); case ContentStreamDocumentMetadata: case ContentStreamImageMetadata: - return $"{sseEvent.Content}"; + return sseEvent.Content; case ContentStreamPresentationMetadata presentationMetadata: var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0; var image = presentationMetadata.Presentation?.Image ?? null; var presentationResult = new StringBuilder(); var streamId = sseEvent.StreamId; - + CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber); - if (slideNumber != currentSlideNumber) + { + presentationResult.AppendLine(); presentationResult.AppendLine($"# Slide {slideNumber}"); + } - presentationResult.Append($"{sseEvent.Content}"); - - if (image is not null) + if(!string.IsNullOrWhiteSpace(sseEvent.Content)) + presentationResult.AppendLine(sseEvent.Content); + + if (extractImages && image is not null) { var imageId = $"{streamId}-{image.Id!}"; var isEnd = ProcessImageSegment(imageId, image); @@ -58,8 +68,8 @@ public static class ContentStreamSseHandler } CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber; - - return presentationResult.ToString(); + return presentationResult.Length is 0 ? null : presentationResult.ToString(); + default: return sseEvent.Content; } @@ -68,7 +78,7 @@ public static class ContentStreamSseHandler return sseEvent.Content; default: - return string.Empty; + return null; } } diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs b/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs index 3d43eb72..cdd8e0cf 100644 --- a/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs +++ b/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs @@ -5,9 +5,10 @@ namespace AIStudio.Tools.Services; public sealed partial class RustService { - public async Task ReadArbitraryFileData(string path, string streamId, int maxChunks) + public async Task ReadArbitraryFileData(string path, int maxChunks, bool extractImages = false) { - var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}"; + var streamId = Guid.NewGuid().ToString(); + var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}"; var request = new HttpRequestMessage(HttpMethod.Get, requestUri); var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead); @@ -36,8 +37,10 @@ public sealed partial class RustService var sseEvent = JsonSerializer.Deserialize(jsonContent); if (sseEvent is not null) { - var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false); - resultBuilder.Append(content); + var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages); + if(content is not null) + resultBuilder.AppendLine(content); + chunkCount++; } } diff --git a/app/MindWork AI Studio/wwwroot/changelog/v0.9.49.md b/app/MindWork AI Studio/wwwroot/changelog/v0.9.49.md index 49fd58ce..6d60292a 100644 --- a/app/MindWork AI Studio/wwwroot/changelog/v0.9.49.md +++ b/app/MindWork AI Studio/wwwroot/changelog/v0.9.49.md @@ -1,5 +1,6 @@ # v0.9.49, build 224 (2025-06-xx xx:xx UTC) -- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. This feature is not yet available in the UI, but it will soon be available. Thanks, Nils, for that great contribution. +- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. Thanks, Nils, for that great contribution. +- Added support for loading arbitrary document data into some assistants. This functionality replaces the previous PDF reading function and was contributed by Nils as well. - Improved the loading of some components that require data fetching, resulting in a more responsive UI. - Improved some awkward phrasings in English and German. - Improved the implementation of configuration plugins to enhance long-term maintainability. diff --git a/runtime/src/file_data.rs b/runtime/src/file_data.rs index 7333f963..f05b18b5 100644 --- a/runtime/src/file_data.rs +++ b/runtime/src/file_data.rs @@ -16,6 +16,7 @@ use rocket::tokio::select; use rocket::Shutdown; use std::path::Path; use std::pin::Pin; +use log::{debug, error}; use tokio::io::AsyncBufReadExt; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; @@ -80,10 +81,10 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token type Result = std::result::Result>; type ChunkStream = Pin> + Send>>; -#[get("/retrieval/fs/extract?&")] -pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] { +#[get("/retrieval/fs/extract?&&")] +pub async fn extract_data(_token: APIToken, path: String, stream_id: String, extract_images: bool, mut end: Shutdown) -> EventStream![] { EventStream! { - let stream_result = stream_data(&path).await; + let stream_result = stream_data(&path, extract_images).await; let id_ref = &stream_id; match stream_result { @@ -115,24 +116,35 @@ pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut } } -async fn stream_data(file_path: &str) -> Result { +async fn stream_data(file_path: &str, extract_images: bool) -> Result { if !Path::new(file_path).exists() { + error!("File does not exist: '{file_path}'"); return Err("File does not exist.".into()); } let file_path_clone = file_path.to_owned(); - let fmt = tokio::task::spawn_blocking(move || { - FileFormat::from_file(&file_path_clone) - }).await??; + let fmt = match FileFormat::from_file(&file_path_clone) { + Ok(format) => format, + Err(error) => { + error!("Failed to determine file format for '{file_path}': {error}"); + return Err(format!("Failed to determine file format for '{file_path}': {error}").into()); + }, + }; let ext = file_path.split('.').next_back().unwrap_or(""); + debug!("Extracting data from file: '{file_path}', format: '{fmt:?}', extension: '{ext}'"); + let stream = match ext { DOCX | ODT => { let from = if ext == DOCX { "docx" } else { "odt" }; convert_with_pandoc(file_path, from, TO_MARKDOWN).await? } - "pptx" => stream_pptx(file_path).await?, + "csv" | "tsv" => { + stream_text_file(file_path, true, Some("csv".to_string())).await? + }, + + "pptx" => stream_pptx(file_path, extract_images).await?, "xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => { stream_spreadsheet_as_csv(file_path).await? @@ -141,47 +153,77 @@ async fn stream_data(file_path: &str) -> Result { _ => match fmt.kind() { Kind::Document => match fmt { FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?, + FileFormat::MicrosoftWordDocument => { convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await? - } + }, + FileFormat::OfficeOpenXmlDocument => { convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await? - } - _ => stream_text_file(file_path).await?, + }, + + _ => stream_text_file(file_path, false, None).await?, }, Kind::Ebook => return Err("Ebooks not yet supported".into()), - Kind::Image => chunk_image(file_path).await?, + + Kind::Image => { + if !extract_images { + return Err("Image extraction is disabled.".into()); + } + + chunk_image(file_path).await? + }, Kind::Other => match fmt { FileFormat::HypertextMarkupLanguage => { convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await? - } - _ => stream_text_file(file_path).await?, + }, + + _ => stream_text_file(file_path, false, None).await?, }, Kind::Presentation => match fmt { FileFormat::OfficeOpenXmlPresentation => { - stream_pptx(file_path).await? - } - _ => stream_text_file(file_path).await?, + stream_pptx(file_path, extract_images).await? + }, + + _ => stream_text_file(file_path, false, None).await?, }, Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?, - _ => stream_text_file(file_path).await?, + + _ => stream_text_file(file_path, false, None).await?, }, }; Ok(Box::pin(stream)) } -async fn stream_text_file(file_path: &str) -> Result { +async fn stream_text_file(file_path: &str, use_md_fences: bool, fence_language: Option) -> Result { let file = tokio::fs::File::open(file_path).await?; let reader = tokio::io::BufReader::new(file); let mut lines = reader.lines(); let mut line_number = 0; let stream = stream! { + + if use_md_fences { + match fence_language { + Some(lang) if lang.trim().is_empty() => { + yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number })); + }, + + Some(lang) => { + yield Ok(Chunk::new(format!("```{}", lang.trim()), Metadata::Text { line_number })); + }, + + None => { + yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number })); + } + }; + } + while let Ok(Some(line)) = lines.next_line().await { line_number += 1; yield Ok(Chunk::new( @@ -189,6 +231,10 @@ async fn stream_text_file(file_path: &str) -> Result { Metadata::Text { line_number } )); } + + if use_md_fences { + yield Ok(Chunk::new("```\n".to_string(), Metadata::Text { line_number })); + } }; Ok(Box::pin(stream)) @@ -251,7 +297,17 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result { } }; - for (row_idx, row) in range.rows().enumerate() { + let mut row_idx = 0; + tx.blocking_send(Ok(Chunk::new( + "```csv".to_string(), + Metadata::Spreadsheet { + sheet_name: sheet_name.clone(), + row_number: row_idx, + } + ))).ok(); + + for row in range.rows() { + row_idx += 1; let content = row.iter() .map(|cell| cell.to_string()) .collect::>() @@ -261,12 +317,20 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result { content, Metadata::Spreadsheet { sheet_name: sheet_name.clone(), - row_number: row_idx + 1, + row_number: row_idx, } ))).is_err() { return; } } + + tx.blocking_send(Ok(Chunk::new( + "```".to_string(), + Metadata::Spreadsheet { + sheet_name: sheet_name.clone(), + row_number: row_idx, + } + ))).ok(); } }); @@ -319,11 +383,11 @@ async fn chunk_image(file_path: &str) -> Result { Ok(Box::pin(stream)) } -async fn stream_pptx(file_path: &str) -> Result { +async fn stream_pptx(file_path: &str, extract_images: bool) -> Result { let path = Path::new(file_path).to_owned(); let parser_config = ParserConfig::builder() - .extract_images(true) + .extract_images(extract_images) .compress_images(true) .quality(75) .image_handling_mode(ImageHandlingMode::Manually) @@ -356,7 +420,6 @@ async fn stream_pptx(file_path: &str) -> Result { if let Some(images) = slide.load_images_manually() { for image in images.iter() { let base64_data = &image.base64_content; - let total_length = base64_data.len(); let mut offset = 0; let mut segment_index = 0;