mirror of
https://github.com/MindWorkAI/AI-Studio.git
synced 2025-07-04 15:42:56 +00:00
Finished the function for importing arbitrary documents (#516)
This commit is contained in:
parent
aaedf667fe
commit
68f5bb1512
@ -38,8 +38,7 @@ public partial class ReadFileContent : MSGComponentBase
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
var streamId = Guid.NewGuid().ToString();
|
var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, int.MaxValue);
|
||||||
var fileContent = await this.RustService.ReadArbitraryFileData(selectedFile.SelectedFilePath, streamId, int.MaxValue);
|
|
||||||
await this.FileContentChanged.InvokeAsync(fileContent);
|
await this.FileContentChanged.InvokeAsync(fileContent);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -8,7 +8,7 @@ public static class ContentStreamSseHandler
|
|||||||
private static readonly ConcurrentDictionary<string, List<ContentStreamPptxImageData>> CHUNKED_IMAGES = new();
|
private static readonly ConcurrentDictionary<string, List<ContentStreamPptxImageData>> CHUNKED_IMAGES = new();
|
||||||
private static readonly ConcurrentDictionary<string, int> CURRENT_SLIDE_NUMBERS = new();
|
private static readonly ConcurrentDictionary<string, int> CURRENT_SLIDE_NUMBERS = new();
|
||||||
|
|
||||||
public static string ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true)
|
public static string? ProcessEvent(ContentStreamSseEvent? sseEvent, bool extractImages = true)
|
||||||
{
|
{
|
||||||
switch (sseEvent)
|
switch (sseEvent)
|
||||||
{
|
{
|
||||||
@ -16,25 +16,32 @@ public static class ContentStreamSseHandler
|
|||||||
switch (sseEvent.Metadata)
|
switch (sseEvent.Metadata)
|
||||||
{
|
{
|
||||||
case ContentStreamTextMetadata:
|
case ContentStreamTextMetadata:
|
||||||
return $"{sseEvent.Content}\n";
|
return sseEvent.Content;
|
||||||
|
|
||||||
case ContentStreamPdfMetadata pdfMetadata:
|
case ContentStreamPdfMetadata pdfMetadata:
|
||||||
var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0;
|
var pageNumber = pdfMetadata.Pdf?.PageNumber ?? 0;
|
||||||
return $"# Page {pageNumber}\n{sseEvent.Content}";
|
return $"""
|
||||||
|
# Page {pageNumber}
|
||||||
|
{sseEvent.Content}
|
||||||
|
|
||||||
|
""";
|
||||||
|
|
||||||
case ContentStreamSpreadsheetMetadata spreadsheetMetadata:
|
case ContentStreamSpreadsheetMetadata spreadsheetMetadata:
|
||||||
var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName;
|
var sheetName = spreadsheetMetadata.Spreadsheet?.SheetName;
|
||||||
var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber;
|
var rowNumber = spreadsheetMetadata.Spreadsheet?.RowNumber;
|
||||||
var spreadSheetResult = new StringBuilder();
|
var spreadSheetResult = new StringBuilder();
|
||||||
if (rowNumber == 1)
|
if (rowNumber == 0)
|
||||||
spreadSheetResult.AppendLine($"\n# {sheetName}");
|
{
|
||||||
|
spreadSheetResult.AppendLine();
|
||||||
|
spreadSheetResult.AppendLine($"# {sheetName}");
|
||||||
|
}
|
||||||
|
|
||||||
spreadSheetResult.AppendLine($"{sseEvent.Content}");
|
spreadSheetResult.Append(sseEvent.Content);
|
||||||
return spreadSheetResult.ToString();
|
return spreadSheetResult.ToString();
|
||||||
|
|
||||||
case ContentStreamDocumentMetadata:
|
case ContentStreamDocumentMetadata:
|
||||||
case ContentStreamImageMetadata:
|
case ContentStreamImageMetadata:
|
||||||
return $"{sseEvent.Content}";
|
return sseEvent.Content;
|
||||||
|
|
||||||
case ContentStreamPresentationMetadata presentationMetadata:
|
case ContentStreamPresentationMetadata presentationMetadata:
|
||||||
var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0;
|
var slideNumber = presentationMetadata.Presentation?.SlideNumber ?? 0;
|
||||||
@ -43,13 +50,16 @@ public static class ContentStreamSseHandler
|
|||||||
var streamId = sseEvent.StreamId;
|
var streamId = sseEvent.StreamId;
|
||||||
|
|
||||||
CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber);
|
CURRENT_SLIDE_NUMBERS.TryGetValue(streamId!, out var currentSlideNumber);
|
||||||
|
|
||||||
if (slideNumber != currentSlideNumber)
|
if (slideNumber != currentSlideNumber)
|
||||||
|
{
|
||||||
|
presentationResult.AppendLine();
|
||||||
presentationResult.AppendLine($"# Slide {slideNumber}");
|
presentationResult.AppendLine($"# Slide {slideNumber}");
|
||||||
|
}
|
||||||
|
|
||||||
presentationResult.Append($"{sseEvent.Content}");
|
if(!string.IsNullOrWhiteSpace(sseEvent.Content))
|
||||||
|
presentationResult.AppendLine(sseEvent.Content);
|
||||||
|
|
||||||
if (image is not null)
|
if (extractImages && image is not null)
|
||||||
{
|
{
|
||||||
var imageId = $"{streamId}-{image.Id!}";
|
var imageId = $"{streamId}-{image.Id!}";
|
||||||
var isEnd = ProcessImageSegment(imageId, image);
|
var isEnd = ProcessImageSegment(imageId, image);
|
||||||
@ -58,8 +68,8 @@ public static class ContentStreamSseHandler
|
|||||||
}
|
}
|
||||||
|
|
||||||
CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber;
|
CURRENT_SLIDE_NUMBERS[streamId!] = slideNumber;
|
||||||
|
return presentationResult.Length is 0 ? null : presentationResult.ToString();
|
||||||
|
|
||||||
return presentationResult.ToString();
|
|
||||||
default:
|
default:
|
||||||
return sseEvent.Content;
|
return sseEvent.Content;
|
||||||
}
|
}
|
||||||
@ -68,7 +78,7 @@ public static class ContentStreamSseHandler
|
|||||||
return sseEvent.Content;
|
return sseEvent.Content;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return string.Empty;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,9 +5,10 @@ namespace AIStudio.Tools.Services;
|
|||||||
|
|
||||||
public sealed partial class RustService
|
public sealed partial class RustService
|
||||||
{
|
{
|
||||||
public async Task<string> ReadArbitraryFileData(string path, string streamId, int maxChunks)
|
public async Task<string> ReadArbitraryFileData(string path, int maxChunks, bool extractImages = false)
|
||||||
{
|
{
|
||||||
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}";
|
var streamId = Guid.NewGuid().ToString();
|
||||||
|
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}";
|
||||||
var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
|
var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
|
||||||
var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
|
var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
|
||||||
|
|
||||||
@ -36,8 +37,10 @@ public sealed partial class RustService
|
|||||||
var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
|
var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
|
||||||
if (sseEvent is not null)
|
if (sseEvent is not null)
|
||||||
{
|
{
|
||||||
var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false);
|
var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages);
|
||||||
resultBuilder.Append(content);
|
if(content is not null)
|
||||||
|
resultBuilder.AppendLine(content);
|
||||||
|
|
||||||
chunkCount++;
|
chunkCount++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# v0.9.49, build 224 (2025-06-xx xx:xx UTC)
|
# v0.9.49, build 224 (2025-06-xx xx:xx UTC)
|
||||||
- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. This feature is not yet available in the UI, but it will soon be available. Thanks, Nils, for that great contribution.
|
- Added a library by Nils Kruthoff (`nilskruthoff`) that allows AI Studio to read PowerPoint files. Thanks, Nils, for that great contribution.
|
||||||
|
- Added support for loading arbitrary document data into some assistants. This functionality replaces the previous PDF reading function and was contributed by Nils as well.
|
||||||
- Improved the loading of some components that require data fetching, resulting in a more responsive UI.
|
- Improved the loading of some components that require data fetching, resulting in a more responsive UI.
|
||||||
- Improved some awkward phrasings in English and German.
|
- Improved some awkward phrasings in English and German.
|
||||||
- Improved the implementation of configuration plugins to enhance long-term maintainability.
|
- Improved the implementation of configuration plugins to enhance long-term maintainability.
|
||||||
|
@ -16,6 +16,7 @@ use rocket::tokio::select;
|
|||||||
use rocket::Shutdown;
|
use rocket::Shutdown;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
|
use log::{debug, error};
|
||||||
use tokio::io::AsyncBufReadExt;
|
use tokio::io::AsyncBufReadExt;
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
use tokio_stream::wrappers::ReceiverStream;
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
@ -80,10 +81,10 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
|
|||||||
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
||||||
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
||||||
|
|
||||||
#[get("/retrieval/fs/extract?<path>&<stream_id>")]
|
#[get("/retrieval/fs/extract?<path>&<stream_id>&<extract_images>")]
|
||||||
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] {
|
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, extract_images: bool, mut end: Shutdown) -> EventStream![] {
|
||||||
EventStream! {
|
EventStream! {
|
||||||
let stream_result = stream_data(&path).await;
|
let stream_result = stream_data(&path, extract_images).await;
|
||||||
let id_ref = &stream_id;
|
let id_ref = &stream_id;
|
||||||
|
|
||||||
match stream_result {
|
match stream_result {
|
||||||
@ -115,24 +116,35 @@ pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_data(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
|
||||||
if !Path::new(file_path).exists() {
|
if !Path::new(file_path).exists() {
|
||||||
|
error!("File does not exist: '{file_path}'");
|
||||||
return Err("File does not exist.".into());
|
return Err("File does not exist.".into());
|
||||||
}
|
}
|
||||||
|
|
||||||
let file_path_clone = file_path.to_owned();
|
let file_path_clone = file_path.to_owned();
|
||||||
let fmt = tokio::task::spawn_blocking(move || {
|
let fmt = match FileFormat::from_file(&file_path_clone) {
|
||||||
FileFormat::from_file(&file_path_clone)
|
Ok(format) => format,
|
||||||
}).await??;
|
Err(error) => {
|
||||||
|
error!("Failed to determine file format for '{file_path}': {error}");
|
||||||
|
return Err(format!("Failed to determine file format for '{file_path}': {error}").into());
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
let ext = file_path.split('.').next_back().unwrap_or("");
|
let ext = file_path.split('.').next_back().unwrap_or("");
|
||||||
|
debug!("Extracting data from file: '{file_path}', format: '{fmt:?}', extension: '{ext}'");
|
||||||
|
|
||||||
let stream = match ext {
|
let stream = match ext {
|
||||||
DOCX | ODT => {
|
DOCX | ODT => {
|
||||||
let from = if ext == DOCX { "docx" } else { "odt" };
|
let from = if ext == DOCX { "docx" } else { "odt" };
|
||||||
convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
|
convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
|
||||||
}
|
}
|
||||||
|
|
||||||
"pptx" => stream_pptx(file_path).await?,
|
"csv" | "tsv" => {
|
||||||
|
stream_text_file(file_path, true, Some("csv".to_string())).await?
|
||||||
|
},
|
||||||
|
|
||||||
|
"pptx" => stream_pptx(file_path, extract_images).await?,
|
||||||
|
|
||||||
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
|
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
|
||||||
stream_spreadsheet_as_csv(file_path).await?
|
stream_spreadsheet_as_csv(file_path).await?
|
||||||
@ -141,47 +153,77 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
|||||||
_ => match fmt.kind() {
|
_ => match fmt.kind() {
|
||||||
Kind::Document => match fmt {
|
Kind::Document => match fmt {
|
||||||
FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?,
|
FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?,
|
||||||
|
|
||||||
FileFormat::MicrosoftWordDocument => {
|
FileFormat::MicrosoftWordDocument => {
|
||||||
convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await?
|
convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await?
|
||||||
}
|
},
|
||||||
|
|
||||||
FileFormat::OfficeOpenXmlDocument => {
|
FileFormat::OfficeOpenXmlDocument => {
|
||||||
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
|
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
|
||||||
}
|
},
|
||||||
_ => stream_text_file(file_path).await?,
|
|
||||||
|
_ => stream_text_file(file_path, false, None).await?,
|
||||||
},
|
},
|
||||||
|
|
||||||
Kind::Ebook => return Err("Ebooks not yet supported".into()),
|
Kind::Ebook => return Err("Ebooks not yet supported".into()),
|
||||||
Kind::Image => chunk_image(file_path).await?,
|
|
||||||
|
Kind::Image => {
|
||||||
|
if !extract_images {
|
||||||
|
return Err("Image extraction is disabled.".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
chunk_image(file_path).await?
|
||||||
|
},
|
||||||
|
|
||||||
Kind::Other => match fmt {
|
Kind::Other => match fmt {
|
||||||
FileFormat::HypertextMarkupLanguage => {
|
FileFormat::HypertextMarkupLanguage => {
|
||||||
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
|
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
|
||||||
}
|
},
|
||||||
_ => stream_text_file(file_path).await?,
|
|
||||||
|
_ => stream_text_file(file_path, false, None).await?,
|
||||||
},
|
},
|
||||||
|
|
||||||
Kind::Presentation => match fmt {
|
Kind::Presentation => match fmt {
|
||||||
FileFormat::OfficeOpenXmlPresentation => {
|
FileFormat::OfficeOpenXmlPresentation => {
|
||||||
stream_pptx(file_path).await?
|
stream_pptx(file_path, extract_images).await?
|
||||||
}
|
},
|
||||||
_ => stream_text_file(file_path).await?,
|
|
||||||
|
_ => stream_text_file(file_path, false, None).await?,
|
||||||
},
|
},
|
||||||
|
|
||||||
Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?,
|
Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?,
|
||||||
_ => stream_text_file(file_path).await?,
|
|
||||||
|
_ => stream_text_file(file_path, false, None).await?,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_text_file(file_path: &str, use_md_fences: bool, fence_language: Option<String>) -> Result<ChunkStream> {
|
||||||
let file = tokio::fs::File::open(file_path).await?;
|
let file = tokio::fs::File::open(file_path).await?;
|
||||||
let reader = tokio::io::BufReader::new(file);
|
let reader = tokio::io::BufReader::new(file);
|
||||||
let mut lines = reader.lines();
|
let mut lines = reader.lines();
|
||||||
let mut line_number = 0;
|
let mut line_number = 0;
|
||||||
|
|
||||||
let stream = stream! {
|
let stream = stream! {
|
||||||
|
|
||||||
|
if use_md_fences {
|
||||||
|
match fence_language {
|
||||||
|
Some(lang) if lang.trim().is_empty() => {
|
||||||
|
yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number }));
|
||||||
|
},
|
||||||
|
|
||||||
|
Some(lang) => {
|
||||||
|
yield Ok(Chunk::new(format!("```{}", lang.trim()), Metadata::Text { line_number }));
|
||||||
|
},
|
||||||
|
|
||||||
|
None => {
|
||||||
|
yield Ok(Chunk::new("```".to_string(), Metadata::Text { line_number }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
while let Ok(Some(line)) = lines.next_line().await {
|
while let Ok(Some(line)) = lines.next_line().await {
|
||||||
line_number += 1;
|
line_number += 1;
|
||||||
yield Ok(Chunk::new(
|
yield Ok(Chunk::new(
|
||||||
@ -189,6 +231,10 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
|
|||||||
Metadata::Text { line_number }
|
Metadata::Text { line_number }
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if use_md_fences {
|
||||||
|
yield Ok(Chunk::new("```\n".to_string(), Metadata::Text { line_number }));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
@ -251,7 +297,17 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
for (row_idx, row) in range.rows().enumerate() {
|
let mut row_idx = 0;
|
||||||
|
tx.blocking_send(Ok(Chunk::new(
|
||||||
|
"```csv".to_string(),
|
||||||
|
Metadata::Spreadsheet {
|
||||||
|
sheet_name: sheet_name.clone(),
|
||||||
|
row_number: row_idx,
|
||||||
|
}
|
||||||
|
))).ok();
|
||||||
|
|
||||||
|
for row in range.rows() {
|
||||||
|
row_idx += 1;
|
||||||
let content = row.iter()
|
let content = row.iter()
|
||||||
.map(|cell| cell.to_string())
|
.map(|cell| cell.to_string())
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
@ -261,12 +317,20 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
|
|||||||
content,
|
content,
|
||||||
Metadata::Spreadsheet {
|
Metadata::Spreadsheet {
|
||||||
sheet_name: sheet_name.clone(),
|
sheet_name: sheet_name.clone(),
|
||||||
row_number: row_idx + 1,
|
row_number: row_idx,
|
||||||
}
|
}
|
||||||
))).is_err() {
|
))).is_err() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tx.blocking_send(Ok(Chunk::new(
|
||||||
|
"```".to_string(),
|
||||||
|
Metadata::Spreadsheet {
|
||||||
|
sheet_name: sheet_name.clone(),
|
||||||
|
row_number: row_idx,
|
||||||
|
}
|
||||||
|
))).ok();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -319,11 +383,11 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
|
|||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_pptx(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
|
||||||
let path = Path::new(file_path).to_owned();
|
let path = Path::new(file_path).to_owned();
|
||||||
|
|
||||||
let parser_config = ParserConfig::builder()
|
let parser_config = ParserConfig::builder()
|
||||||
.extract_images(true)
|
.extract_images(extract_images)
|
||||||
.compress_images(true)
|
.compress_images(true)
|
||||||
.quality(75)
|
.quality(75)
|
||||||
.image_handling_mode(ImageHandlingMode::Manually)
|
.image_handling_mode(ImageHandlingMode::Manually)
|
||||||
@ -356,7 +420,6 @@ async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
|||||||
if let Some(images) = slide.load_images_manually() {
|
if let Some(images) = slide.load_images_manually() {
|
||||||
for image in images.iter() {
|
for image in images.iter() {
|
||||||
let base64_data = &image.base64_content;
|
let base64_data = &image.base64_content;
|
||||||
|
|
||||||
let total_length = base64_data.len();
|
let total_length = base64_data.len();
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
let mut segment_index = 0;
|
let mut segment_index = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user