Mirror of https://github.com/MindWorkAI/AI-Studio.git (synced 2025-07-04 03:42:56 +00:00)
Finished the function for importing arbitrary documents (#516)
This commit is contained in:
parent
aaedf667fe
commit
68f5bb1512
@@ -38,8 +38,7 @@ public partial class ReadFileContent : MSGComponentBase
             return;
         }
 
-        var streamId = Guid.NewGuid().ToString();
-        var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, streamId, int.MaxValue);
+        var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, int.MaxValue);
         await this.FileContentChanged.InvokeAsync(fileContent);
     }
 }
@@ -8,7 +8,7 @@ public static class ContentStreamSseHandler
     private static readonly ConcurrentDictionary<string, List<ContentStreamPptxImageData>> CHUNKED_IMAGES = new();
     private static readonly ConcurrentDictionary<string, int> CURRENT_SLIDE_NUMBERS = new();
 
-    public static string ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true)
+    public static string? ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true)
     {
         switch (sseEvent)
         {
@@ -16,25 +16,32 @@ public static class ContentStreamSseHandler
                 switch (sseEvent.Metadata)
                 {
                     case ContentStreamTextMetadata:
-                        return $"{sseEvent.Content}\n";
+                        return sseEvent.Content;
 
                     case ContentStreamPdfMetadata pdfMetadata:
                         var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0;
-                        return $"# Page {pageNumber}\n{sseEvent.Content}";
+                        return $"""
+                                # Page {pageNumber}
+                                {sseEvent.Content}
+
+                                """;
 
                     case ContentStreamSpreadsheetMetadata spreadsheetMetadata:
                         var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName;
                         var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber;
                         var spreadSheetResult = new StringBuilder();
-                        if (rowNumber == 1)
-                            spreadSheetResult.AppendLine($"\n# {sheetName}");
+                        if (rowNumber == 0)
+                        {
+                            spreadSheetResult.AppendLine();
+                            spreadSheetResult.AppendLine($"# {sheetName}");
+                        }
 
-                        spreadSheetResult.AppendLine($"{sseEvent.Content}");
+                        spreadSheetResult.Append(sseEvent.Content);
                         return spreadSheetResult.ToString();
 
                     case ContentStreamDocumentMetadata:
                     case ContentStreamImageMetadata:
-                        return $"{sseEvent.Content}";
+                        return sseEvent.Content;
 
                     case ContentStreamPresentationMetadata presentationMetadata:
                         var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0;
@@ -43,13 +50,16 @@ public static class ContentStreamSseHandler
                         var streamId = sseEvent.StreamId;
 
                         CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber);
 
                         if (slideNumber != currentSlideNumber)
                         {
                             presentationResult.AppendLine();
                             presentationResult.AppendLine($"# Slide {slideNumber}");
                         }
 
-                        presentationResult.Append($"{sseEvent.Content}");
+                        if(!string.IsNullOrWhiteSpace(sseEvent.Content))
+                            presentationResult.AppendLine(sseEvent.Content);
 
-                        if (image is not null)
+                        if (extractImages && image is not null)
                         {
                             var imageId = $"{streamId}-{image.Id!}";
                             var isEnd = ProcessImageSegment(imageId, image);
@@ -58,8 +68,8 @@ public static class ContentStreamSseHandler
                         }
 
                         CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber;
-                        return presentationResult.ToString();
+                        return presentationResult.Length is 0 ? null : presentationResult.ToString();
 
                     default:
                         return sseEvent.Content;
                 }
@@ -68,7 +78,7 @@ public static class ContentStreamSseHandler
                 return sseEvent.Content;
 
             default:
-                return string.Empty;
+                return null;
         }
     }
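Note on the new nullable return type: with ProcessEvent returning string?, a caller can distinguish "nothing renderable" (for example, a presentation event that carried only image segments) from an empty string. A minimal consumer sketch, assuming the SSE events were already deserialized into a collection named receivedEvents (a hypothetical name, not from this commit):

    using System.Text;

    var resultBuilder = new StringBuilder();
    foreach (var sseEvent in receivedEvents)
    {
        // Null means the event produced nothing worth appending (e.g., image-only chunks).
        var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages: false);
        if (content is not null)
            resultBuilder.AppendLine(content);
    }

This is the same null-filtering pattern the RustService change below adopts.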
@@ -5,9 +5,10 @@ namespace AIStudio.Tools.Services;
 
 public sealed partial class RustService
 {
-    public async Task<string> ReadArbitraryFileData(string path, string streamId, int maxChunks)
+    public async Task<string> ReadArbitraryFileData(string path, int maxChunks, bool extractImages = false)
     {
-        var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}";
+        var streamId = Guid.NewGuid().ToString();
+        var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}";
         var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
         var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
 
@@ -36,8 +37,10 @@ public sealed partial class RustService
                 var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
                 if (sseEvent is not null)
                 {
-                    var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false);
-                    resultBuilder.Append(content);
+                    var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages);
+                    if(content is not null)
+                        resultBuilder.AppendLine(content);
+
                     chunkCount++;
                 }
             }
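Call sites no longer create a stream ID; ReadArbitraryFileData now generates one per request and forwards the new extract_images flag to the Rust endpoint. A hedged usage sketch (the paths are hypothetical; extractImages defaults to false), given a RustService instance:

    // Plain text/Markdown extraction, no image data:
    var text = await this.RustService.ReadArbitraryFileData("/tmp/report.xlsx", int.MaxValue);

    // Opt in to image extraction, e.g. for PowerPoint decks:
    var withImages = await this.RustService.ReadArbitraryFileData("/tmp/slides.pptx", int.MaxValue, extractImages: true);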
@@ -1,5 +1,6 @@
 # v0.9.49, build 224 (2025-06-xx xx:xx UTC)
-- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. This feature is not yet available in the UI, but it will soon be available. Thanks, Nils, for that great contribution.
+- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. Thanks, Nils, for that great contribution.
+- Added support for loading arbitrary document data into some assistants. This functionality replaces the previous PDF reading function and was contributed by Nils as well.
 - Improved the loading of some components that require data fetching, resulting in a more responsive UI.
 - Improved some awkward phrasings in English and German.
 - Improved the implementation of configuration plugins to enhance long-term maintainability.
@@ -16,6 +16,7 @@ use rocket::tokio::select;
 use rocket::Shutdown;
 use std::path::Path;
 use std::pin::Pin;
+use log::{debug, error};
 use tokio::io::AsyncBufReadExt;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
@@ -80,10 +81,10 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
 type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
 type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
 
-#[get("/retrieval/fs/extract?<path>&<stream_id>")]
-pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] {
+#[get("/retrieval/fs/extract?<path>&<stream_id>&<extract_images>")]
+pub async fn extract_data(_token: APIToken, path: String, stream_id: String, extract_images: bool, mut end: Shutdown) -> EventStream![] {
     EventStream! {
-        let stream_result = stream_data(&path).await;
+        let stream_result = stream_data(&path, extract_images).await;
         let id_ref = &stream_id;
 
         match stream_result {
@@ -115,24 +116,35 @@ pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut
         }
     }
 }
 
-async fn stream_data(file_path: &str) -> Result<ChunkStream> {
+async fn stream_data(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
     if !Path::new(file_path).exists() {
         error!("File does not exist: '{file_path}'");
         return Err("File does not exist.".into());
     }
 
     let file_path_clone = file_path.to_owned();
-    let fmt = tokio::task::spawn_blocking(move || {
-        FileFormat::from_file(&file_path_clone)
-    }).await??;
+    let fmt = match FileFormat::from_file(&file_path_clone) {
+        Ok(format) => format,
+        Err(error) => {
+            error!("Failed to determine file format for '{file_path}': {error}");
+            return Err(format!("Failed to determine file format for '{file_path}': {error}").into());
+        },
+    };
 
     let ext = file_path.split('.').next_back().unwrap_or("");
+    debug!("Extracting data from file: '{file_path}', format: '{fmt:?}', extension: '{ext}'");
 
     let stream = match ext {
         DOCX | ODT => {
             let from = if ext == DOCX { "docx" } else { "odt" };
             convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
         }
 
-        "pptx" => stream_pptx(file_path).await?,
+        "csv" | "tsv" => {
+            stream_text_file(file_path, true, Some("csv".to_string())).await?
+        },
+
+        "pptx" => stream_pptx(file_path, extract_images).await?,
 
         "xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
             stream_spreadsheet_as_csv(file_path).await?
@@ -141,47 +153,77 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
         _ => match fmt.kind() {
             Kind::Document => match fmt {
                 FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?,
 
                 FileFormat::MicrosoftWordDocument => {
                     convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await?
-                }
+                },
 
                 FileFormat::OfficeOpenXmlDocument => {
                     convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
                 }
-                _ => stream_text_file(file_path).await?,
-            },
+
+                _ => stream_text_file(file_path, false, None).await?,
+            },
 
             Kind::Ebook => return Err("Ebooks not yet supported".into()),
-            Kind::Image => chunk_image(file_path).await?,
+
+            Kind::Image => {
+                if !extract_images {
+                    return Err("Image extraction is disabled.".into());
+                }
+
+                chunk_image(file_path).await?
+            },
 
             Kind::Other => match fmt {
                 FileFormat::HypertextMarkupLanguage => {
                     convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
                 }
-                _ => stream_text_file(file_path).await?,
+
+                _ => stream_text_file(file_path, false, None).await?,
             },
 
             Kind::Presentation => match fmt {
                 FileFormat::OfficeOpenXmlPresentation => {
-                    stream_pptx(file_path).await?
-                }
-                _ => stream_text_file(file_path).await?,
+                    stream_pptx(file_path, extract_images).await?
+                },
+
+                _ => stream_text_file(file_path, false, None).await?,
             },
 
             Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?,
-            _ => stream_text_file(file_path).await?,
+
+            _ => stream_text_file(file_path, false, None).await?,
         },
     };
 
     Ok(Box::pin(stream))
 }
 
-async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
+async fn stream_text_file(file_path: &str, use_md_fences: bool, fence_language: Option<String>) -> Result<ChunkStream> {
     let file = tokio::fs::File::open(file_path).await?;
     let reader = tokio::io::BufReader::new(file);
     let mut lines = reader.lines();
     let mut line_number = 0;
 
     let stream = stream! {
+
+        if use_md_fences {
+            match fence_language {
+                Some(lang) if lang.trim().is_empty() => {
+                    yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number }));
+                },
+
+                Some(lang) => {
+                    yield Ok(Chunk::new(format!("```{}", lang.trim()), Metadata::Text { line_number }));
+                },
+
+                None => {
+                    yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number }));
+                }
+            };
+        }
+
         while let Ok(Some(line)) = lines.next_line().await {
             line_number += 1;
             yield Ok(Chunk::new(
@@ -189,6 +231,10 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
                 Metadata::Text { line_number }
             ));
         }
+
+        if use_md_fences {
+            yield Ok(Chunk::new("```\n".to_string(), Metadata::Text { line_number }));
+        }
     };
 
     Ok(Box::pin(stream))
@@ -251,7 +297,17 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
            }
        };
 
-        for (row_idx, row) in range.rows().enumerate() {
+        let mut row_idx = 0;
+        tx.blocking_send(Ok(Chunk::new(
+            "```csv".to_string(),
+            Metadata::Spreadsheet {
+                sheet_name: sheet_name.clone(),
+                row_number: row_idx,
+            }
+        ))).ok();
+
+        for row in range.rows() {
+            row_idx += 1;
             let content = row.iter()
                 .map(|cell| cell.to_string())
                 .collect::<Vec<_>>()
@@ -261,12 +317,20 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
                 content,
                 Metadata::Spreadsheet {
                     sheet_name: sheet_name.clone(),
-                    row_number: row_idx + 1,
+                    row_number: row_idx,
                }
            ))).is_err() {
                return;
            }
        }
 
+        tx.blocking_send(Ok(Chunk::new(
+            "```".to_string(),
+            Metadata::Spreadsheet {
+                sheet_name: sheet_name.clone(),
+                row_number: row_idx,
+            }
+        ))).ok();
    }
 });
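Combined with the C# handler above (which prepends a heading when row_number == 0), each sheet should now arrive as a fenced CSV block: the opening fence is emitted as row 0, data rows are numbered from 1, and a trailing chunk closes the fence. A sketch of the assembled result for a small sheet, where the sheet name and cell values are made up for illustration:

    // Expected assembled output for one sheet (sketch, not from this commit):
    var expected = """

                   # Sheet1
                   ```csv
                   Name,Value
                   Alpha,1
                   ```
                   """;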
@@ -319,11 +383,11 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
     Ok(Box::pin(stream))
 }
 
-async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
+async fn stream_pptx(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
     let path = Path::new(file_path).to_owned();
 
     let parser_config = ParserConfig::builder()
-        .extract_images(true)
+        .extract_images(extract_images)
         .compress_images(true)
         .quality(75)
         .image_handling_mode(ImageHandlingMode::Manually)
@@ -356,7 +420,6 @@ async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
             if let Some(images) = slide.load_images_manually() {
                 for image in images.iter() {
                     let base64_data = &image.base64_content;
-
                     let total_length = base64_data.len();
                     let mut offset = 0;
                     let mut segment_index = 0;
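On the C# side, these base64 image segments are reassembled through CHUNKED_IMAGES and ProcessImageSegment(imageId, image), which reports whether the final segment has arrived. The actual implementation is not part of this diff; the following is only a sketch of the contract implied by the handler code above, and the IsEnd flag on the segment DTO is an assumption:

    // Hypothetical reassembly sketch, NOT the code from this commit:
    private static bool ProcessImageSegment(string imageId, ContentStreamPptxImageData image)
    {
        // Collect segments per image until the producer marks the end.
        var segments = CHUNKED_IMAGES.GetOrAdd(imageId, _ => new List<ContentStreamPptxImageData>());
        lock (segments)
            segments.Add(image);

        var isEnd = image.IsEnd; // assumed flag marking the last segment
        // When isEnd is true, concatenating the segments' base64 content
        // (up to IMAGE_SEGMENT_SIZE_IN_CHARS = 8_192 chars each) restores the image.
        return isEnd;
    }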