Finished the function for importing arbitrary documents (#516)

This commit is contained in:
Thorsten Sommer 2025-06-30 21:51:02 +02:00 committed by GitHub
parent aaedf667fe
commit 68f5bb1512
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 123 additions and 47 deletions

View File

@@ -38,8 +38,7 @@ public partial class ReadFileContent : MSGComponentBase
return; return;
} }
var streamId = Guid.NewGuid().ToString(); var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, int.MaxValue);
var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, streamId, int.MaxValue);
await this.FileContentChanged.InvokeAsync(fileContent); await this.FileContentChanged.InvokeAsync(fileContent);
} }
} }

View File

@@ -8,7 +8,7 @@ public static class ContentStreamSseHandler
private static readonly ConcurrentDictionary<string, List<ContentStreamPptxImageData>> CHUNKED_IMAGES = new(); private static readonly ConcurrentDictionary<string, List<ContentStreamPptxImageData>> CHUNKED_IMAGES = new();
private static readonly ConcurrentDictionary<string, int> CURRENT_SLIDE_NUMBERS = new(); private static readonly ConcurrentDictionary<string, int> CURRENT_SLIDE_NUMBERS = new();
public static string ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true) public static string? ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true)
{ {
switch (sseEvent) switch (sseEvent)
{ {
@@ -16,40 +16,50 @@ public static class ContentStreamSseHandler
switch (sseEvent.Metadata) switch (sseEvent.Metadata)
{ {
case ContentStreamTextMetadata: case ContentStreamTextMetadata:
return $"{sseEvent.Content}\n"; return sseEvent.Content;
case ContentStreamPdfMetadata pdfMetadata: case ContentStreamPdfMetadata pdfMetadata:
var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0; var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0;
return $"# Page {pageNumber}\n{sseEvent.Content}"; return $"""
# Page {pageNumber}
{sseEvent.Content}
""";
case ContentStreamSpreadsheetMetadata spreadsheetMetadata: case ContentStreamSpreadsheetMetadata spreadsheetMetadata:
var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName; var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName;
var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber; var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber;
var spreadSheetResult = new StringBuilder(); var spreadSheetResult = new StringBuilder();
if (rowNumber == 1) if (rowNumber == 0)
spreadSheetResult.AppendLine($"\n# {sheetName}"); {
spreadSheetResult.AppendLine();
spreadSheetResult.AppendLine($"{sseEvent.Content}"); spreadSheetResult.AppendLine($"# {sheetName}");
}
spreadSheetResult.Append(sseEvent.Content);
return spreadSheetResult.ToString(); return spreadSheetResult.ToString();
case ContentStreamDocumentMetadata: case ContentStreamDocumentMetadata:
case ContentStreamImageMetadata: case ContentStreamImageMetadata:
return $"{sseEvent.Content}"; return sseEvent.Content;
case ContentStreamPresentationMetadata presentationMetadata: case ContentStreamPresentationMetadata presentationMetadata:
var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0; var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0;
var image = presentationMetadata.Presentation?.Image ?? null; var image = presentationMetadata.Presentation?.Image ?? null;
var presentationResult = new StringBuilder(); var presentationResult = new StringBuilder();
var streamId = sseEvent.StreamId; var streamId = sseEvent.StreamId;
CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber); CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber);
if (slideNumber != currentSlideNumber) if (slideNumber != currentSlideNumber)
{
presentationResult.AppendLine();
presentationResult.AppendLine($"# Slide {slideNumber}"); presentationResult.AppendLine($"# Slide {slideNumber}");
}
presentationResult.Append($"{sseEvent.Content}"); if(!string.IsNullOrWhiteSpace(sseEvent.Content))
presentationResult.AppendLine(sseEvent.Content);
if (image is not null)
if (extractImages && image is not null)
{ {
var imageId = $"{streamId}-{image.Id!}"; var imageId = $"{streamId}-{image.Id!}";
var isEnd = ProcessImageSegment(imageId, image); var isEnd = ProcessImageSegment(imageId, image);
@@ -58,8 +68,8 @@ public static class ContentStreamSseHandler
} }
CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber; CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber;
return presentationResult.Length is 0 ? null : presentationResult.ToString();
return presentationResult.ToString();
default: default:
return sseEvent.Content; return sseEvent.Content;
} }
@@ -68,7 +78,7 @@ public static class ContentStreamSseHandler
return sseEvent.Content; return sseEvent.Content;
default: default:
return string.Empty; return null;
} }
} }

View File

@@ -5,9 +5,10 @@ namespace AIStudio.Tools.Services;
public sealed partial class RustService public sealed partial class RustService
{ {
public async Task<string> ReadArbitraryFileData(string path, string streamId, int maxChunks) public async Task<string> ReadArbitraryFileData(string path, int maxChunks, bool extractImages = false)
{ {
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}"; var streamId = Guid.NewGuid().ToString();
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}";
var request = new HttpRequestMessage(HttpMethod.Get, requestUri); var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead); var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
@@ -36,8 +37,10 @@ public sealed partial class RustService
var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent); var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
if (sseEvent is not null) if (sseEvent is not null)
{ {
var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false); var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages);
resultBuilder.Append(content); if(content is not null)
resultBuilder.AppendLine(content);
chunkCount++; chunkCount++;
} }
} }

View File

@@ -1,5 +1,6 @@
# v0.9.49, build 224 (2025-06-xx xx:xx UTC) # v0.9.49, build 224 (2025-06-xx xx:xx UTC)
- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. This feature is not yet available in the UI, but it will soon be available. Thanks, Nils, for that great contribution. - Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. Thanks, Nils, for that great contribution.
- Added support for loading arbitrary document data into some assistants. This functionality replaces the previous PDF reading function and was contributed by Nils as well.
- Improved the loading of some components that require data fetching, resulting in a more responsive UI. - Improved the loading of some components that require data fetching, resulting in a more responsive UI.
- Improved some awkward phrasings in English and German. - Improved some awkward phrasings in English and German.
- Improved the implementation of configuration plugins to enhance long-term maintainability. - Improved the implementation of configuration plugins to enhance long-term maintainability.

View File

@@ -16,6 +16,7 @@ use rocket::tokio::select;
use rocket::Shutdown; use rocket::Shutdown;
use std::path::Path; use std::path::Path;
use std::pin::Pin; use std::pin::Pin;
use log::{debug, error};
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream; use tokio_stream::wrappers::ReceiverStream;
@@ -80,10 +81,10 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>; type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>; type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
#[get("/retrieval/fs/extract?<path>&<stream_id>")] #[get("/retrieval/fs/extract?<path>&<stream_id>&<extract_images>")]
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] { pub async fn extract_data(_token: APIToken, path: String, stream_id: String, extract_images: bool, mut end: Shutdown) -> EventStream![] {
EventStream! { EventStream! {
let stream_result = stream_data(&path).await; let stream_result = stream_data(&path, extract_images).await;
let id_ref = &stream_id; let id_ref = &stream_id;
match stream_result { match stream_result {
@@ -115,24 +116,35 @@ pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut
} }
} }
async fn stream_data(file_path: &str) -> Result<ChunkStream> { async fn stream_data(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
if !Path::new(file_path).exists() { if !Path::new(file_path).exists() {
error!("File does not exist: '{file_path}'");
return Err("File does not exist.".into()); return Err("File does not exist.".into());
} }
let file_path_clone = file_path.to_owned(); let file_path_clone = file_path.to_owned();
let fmt = tokio::task::spawn_blocking(move || { let fmt = match FileFormat::from_file(&file_path_clone) {
FileFormat::from_file(&file_path_clone) Ok(format) => format,
}).await??; Err(error) => {
error!("Failed to determine file format for '{file_path}': {error}");
return Err(format!("Failed to determine file format for '{file_path}': {error}").into());
},
};
let ext = file_path.split('.').next_back().unwrap_or(""); let ext = file_path.split('.').next_back().unwrap_or("");
debug!("Extracting data from file: '{file_path}', format: '{fmt:?}', extension: '{ext}'");
let stream = match ext { let stream = match ext {
DOCX | ODT => { DOCX | ODT => {
let from = if ext == DOCX { "docx" } else { "odt" }; let from = if ext == DOCX { "docx" } else { "odt" };
convert_with_pandoc(file_path, from, TO_MARKDOWN).await? convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
} }
"pptx" => stream_pptx(file_path).await?, "csv" | "tsv" => {
stream_text_file(file_path, true, Some("csv".to_string())).await?
},
"pptx" => stream_pptx(file_path, extract_images).await?,
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => { "xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
stream_spreadsheet_as_csv(file_path).await? stream_spreadsheet_as_csv(file_path).await?
@@ -141,47 +153,77 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
_ => match fmt.kind() { _ => match fmt.kind() {
Kind::Document => match fmt { Kind::Document => match fmt {
FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?, FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?,
FileFormat::MicrosoftWordDocument => { FileFormat::MicrosoftWordDocument => {
convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await? convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await?
} },
FileFormat::OfficeOpenXmlDocument => { FileFormat::OfficeOpenXmlDocument => {
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await? convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
} },
_ => stream_text_file(file_path).await?,
_ => stream_text_file(file_path, false, None).await?,
}, },
Kind::Ebook => return Err("Ebooks not yet supported".into()), Kind::Ebook => return Err("Ebooks not yet supported".into()),
Kind::Image => chunk_image(file_path).await?,
Kind::Image => {
if !extract_images {
return Err("Image extraction is disabled.".into());
}
chunk_image(file_path).await?
},
Kind::Other => match fmt { Kind::Other => match fmt {
FileFormat::HypertextMarkupLanguage => { FileFormat::HypertextMarkupLanguage => {
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await? convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
} },
_ => stream_text_file(file_path).await?,
_ => stream_text_file(file_path, false, None).await?,
}, },
Kind::Presentation => match fmt { Kind::Presentation => match fmt {
FileFormat::OfficeOpenXmlPresentation => { FileFormat::OfficeOpenXmlPresentation => {
stream_pptx(file_path).await? stream_pptx(file_path, extract_images).await?
} },
_ => stream_text_file(file_path).await?,
_ => stream_text_file(file_path, false, None).await?,
}, },
Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?, Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?,
_ => stream_text_file(file_path).await?,
_ => stream_text_file(file_path, false, None).await?,
}, },
}; };
Ok(Box::pin(stream)) Ok(Box::pin(stream))
} }
async fn stream_text_file(file_path: &str) -> Result<ChunkStream> { async fn stream_text_file(file_path: &str, use_md_fences: bool, fence_language: Option<String>) -> Result<ChunkStream> {
let file = tokio::fs::File::open(file_path).await?; let file = tokio::fs::File::open(file_path).await?;
let reader = tokio::io::BufReader::new(file); let reader = tokio::io::BufReader::new(file);
let mut lines = reader.lines(); let mut lines = reader.lines();
let mut line_number = 0; let mut line_number = 0;
let stream = stream! { let stream = stream! {
if use_md_fences {
match fence_language {
Some(lang) if lang.trim().is_empty() => {
yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number }));
},
Some(lang) => {
yield Ok(Chunk::new(format!("```{}", lang.trim()), Metadata::Text { line_number }));
},
None => {
yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number }));
}
};
}
while let Ok(Some(line)) = lines.next_line().await { while let Ok(Some(line)) = lines.next_line().await {
line_number += 1; line_number += 1;
yield Ok(Chunk::new( yield Ok(Chunk::new(
@@ -189,6 +231,10 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
Metadata::Text { line_number } Metadata::Text { line_number }
)); ));
} }
if use_md_fences {
yield Ok(Chunk::new("```\n".to_string(), Metadata::Text { line_number }));
}
}; };
Ok(Box::pin(stream)) Ok(Box::pin(stream))
@@ -251,7 +297,17 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
} }
}; };
for (row_idx, row) in range.rows().enumerate() { let mut row_idx = 0;
tx.blocking_send(Ok(Chunk::new(
"```csv".to_string(),
Metadata::Spreadsheet {
sheet_name: sheet_name.clone(),
row_number: row_idx,
}
))).ok();
for row in range.rows() {
row_idx += 1;
let content = row.iter() let content = row.iter()
.map(|cell| cell.to_string()) .map(|cell| cell.to_string())
.collect::<Vec<_>>() .collect::<Vec<_>>()
@@ -261,12 +317,20 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
content, content,
Metadata::Spreadsheet { Metadata::Spreadsheet {
sheet_name: sheet_name.clone(), sheet_name: sheet_name.clone(),
row_number: row_idx + 1, row_number: row_idx,
} }
))).is_err() { ))).is_err() {
return; return;
} }
} }
tx.blocking_send(Ok(Chunk::new(
"```".to_string(),
Metadata::Spreadsheet {
sheet_name: sheet_name.clone(),
row_number: row_idx,
}
))).ok();
} }
}); });
@@ -319,11 +383,11 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
Ok(Box::pin(stream)) Ok(Box::pin(stream))
} }
async fn stream_pptx(file_path: &str) -> Result<ChunkStream> { async fn stream_pptx(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
let path = Path::new(file_path).to_owned(); let path = Path::new(file_path).to_owned();
let parser_config = ParserConfig::builder() let parser_config = ParserConfig::builder()
.extract_images(true) .extract_images(extract_images)
.compress_images(true) .compress_images(true)
.quality(75) .quality(75)
.image_handling_mode(ImageHandlingMode::Manually) .image_handling_mode(ImageHandlingMode::Manually)
@@ -356,7 +420,6 @@ async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
if let Some(images) = slide.load_images_manually() { if let Some(images) = slide.load_images_manually() {
for image in images.iter() { for image in images.iter() {
let base64_data = &image.base64_content; let base64_data = &image.base64_content;
let total_length = base64_data.len(); let total_length = base64_data.len();
let mut offset = 0; let mut offset = 0;
let mut segment_index = 0; let mut segment_index = 0;