Exposed the image extraction state to file data streaming and passed this state to Rust for optimizations

This commit is contained in:
Thorsten Sommer 2025-06-30 20:54:12 +02:00
parent a17f39d28e
commit 5c05d3df3c
Signed by: tsommer
GPG Key ID: 371BBA77A02C0108
2 changed files with 19 additions and 13 deletions

View File

@ -5,10 +5,10 @@ namespace AIStudio.Tools.Services;
public sealed partial class RustService public sealed partial class RustService
{ {
public async Task<string> ReadArbitraryFileData(string path, int maxChunks) public async Task<string> ReadArbitraryFileData(string path, int maxChunks, bool extractImages = false)
{ {
var streamId = Guid.NewGuid().ToString(); var streamId = Guid.NewGuid().ToString();
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}"; var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}";
var request = new HttpRequestMessage(HttpMethod.Get, requestUri); var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead); var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
@ -37,7 +37,7 @@ public sealed partial class RustService
var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent); var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
if (sseEvent is not null) if (sseEvent is not null)
{ {
var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false); var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages);
if(content is not null) if(content is not null)
resultBuilder.AppendLine(content); resultBuilder.AppendLine(content);

View File

@ -81,10 +81,10 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>; type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>; type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
#[get("/retrieval/fs/extract?<path>&<stream_id>")] #[get("/retrieval/fs/extract?<path>&<stream_id>&<extract_images>")]
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] { pub async fn extract_data(_token: APIToken, path: String, stream_id: String, extract_images: bool, mut end: Shutdown) -> EventStream![] {
EventStream! { EventStream! {
let stream_result = stream_data(&path).await; let stream_result = stream_data(&path, extract_images).await;
let id_ref = &stream_id; let id_ref = &stream_id;
match stream_result { match stream_result {
@ -116,7 +116,7 @@ pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut
} }
} }
async fn stream_data(file_path: &str) -> Result<ChunkStream> { async fn stream_data(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
if !Path::new(file_path).exists() { if !Path::new(file_path).exists() {
error!("File does not exist: '{file_path}'"); error!("File does not exist: '{file_path}'");
return Err("File does not exist.".into()); return Err("File does not exist.".into());
@ -140,7 +140,7 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
convert_with_pandoc(file_path, from, TO_MARKDOWN).await? convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
} }
"pptx" => stream_pptx(file_path).await?, "pptx" => stream_pptx(file_path, extract_images).await?,
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => { "xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
stream_spreadsheet_as_csv(file_path).await? stream_spreadsheet_as_csv(file_path).await?
@ -163,7 +163,13 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
Kind::Ebook => return Err("Ebooks not yet supported".into()), Kind::Ebook => return Err("Ebooks not yet supported".into()),
Kind::Image => chunk_image(file_path).await?, Kind::Image => {
if !extract_images {
return Err("Image extraction is disabled.".into());
}
chunk_image(file_path).await?
},
Kind::Other => match fmt { Kind::Other => match fmt {
FileFormat::HypertextMarkupLanguage => { FileFormat::HypertextMarkupLanguage => {
@ -175,7 +181,7 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
Kind::Presentation => match fmt { Kind::Presentation => match fmt {
FileFormat::OfficeOpenXmlPresentation => { FileFormat::OfficeOpenXmlPresentation => {
stream_pptx(file_path).await? stream_pptx(file_path, extract_images).await?
}, },
_ => stream_text_file(file_path).await?, _ => stream_text_file(file_path).await?,
@ -334,11 +340,11 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
Ok(Box::pin(stream)) Ok(Box::pin(stream))
} }
async fn stream_pptx(file_path: &str) -> Result<ChunkStream> { async fn stream_pptx(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
let path = Path::new(file_path).to_owned(); let path = Path::new(file_path).to_owned();
let parser_config = ParserConfig::builder() let parser_config = ParserConfig::builder()
.extract_images(true) .extract_images(extract_images)
.compress_images(true) .compress_images(true)
.quality(75) .quality(75)
.image_handling_mode(ImageHandlingMode::Manually) .image_handling_mode(ImageHandlingMode::Manually)