mirror of
https://github.com/MindWorkAI/AI-Studio.git
synced 2025-07-28 02:02:57 +00:00
Exposed the image extraction state to file data streaming and passed this state to Rust for optimizations
This commit is contained in:
parent
a17f39d28e
commit
5c05d3df3c
@ -5,10 +5,10 @@ namespace AIStudio.Tools.Services;
|
|||||||
|
|
||||||
public sealed partial class RustService
|
public sealed partial class RustService
|
||||||
{
|
{
|
||||||
public async Task<string> ReadArbitraryFileData(string path, int maxChunks)
|
public async Task<string> ReadArbitraryFileData(string path, int maxChunks, bool extractImages = false)
|
||||||
{
|
{
|
||||||
var streamId = Guid.NewGuid().ToString();
|
var streamId = Guid.NewGuid().ToString();
|
||||||
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}";
|
var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}";
|
||||||
var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
|
var request = new HttpRequestMessage(HttpMethod.Get, requestUri);
|
||||||
var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
|
var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
|
||||||
|
|
||||||
@ -37,7 +37,7 @@ public sealed partial class RustService
|
|||||||
var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
|
var sseEvent = JsonSerializer.Deserialize<ContentStreamSseEvent>(jsonContent);
|
||||||
if (sseEvent is not null)
|
if (sseEvent is not null)
|
||||||
{
|
{
|
||||||
var content = ContentStreamSseHandler.ProcessEvent(sseEvent, false);
|
var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages);
|
||||||
if(content is not null)
|
if(content is not null)
|
||||||
resultBuilder.AppendLine(content);
|
resultBuilder.AppendLine(content);
|
||||||
|
|
||||||
|
@ -81,10 +81,10 @@ const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
|
|||||||
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
||||||
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
||||||
|
|
||||||
#[get("/retrieval/fs/extract?<path>&<stream_id>")]
|
#[get("/retrieval/fs/extract?<path>&<stream_id>&<extract_images>")]
|
||||||
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut end: Shutdown) -> EventStream![] {
|
pub async fn extract_data(_token: APIToken, path: String, stream_id: String, extract_images: bool, mut end: Shutdown) -> EventStream![] {
|
||||||
EventStream! {
|
EventStream! {
|
||||||
let stream_result = stream_data(&path).await;
|
let stream_result = stream_data(&path, extract_images).await;
|
||||||
let id_ref = &stream_id;
|
let id_ref = &stream_id;
|
||||||
|
|
||||||
match stream_result {
|
match stream_result {
|
||||||
@ -116,7 +116,7 @@ pub async fn extract_data(_token: APIToken, path: String, stream_id: String, mut
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_data(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
|
||||||
if !Path::new(file_path).exists() {
|
if !Path::new(file_path).exists() {
|
||||||
error!("File does not exist: '{file_path}'");
|
error!("File does not exist: '{file_path}'");
|
||||||
return Err("File does not exist.".into());
|
return Err("File does not exist.".into());
|
||||||
@ -133,14 +133,14 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
|||||||
|
|
||||||
let ext = file_path.split('.').next_back().unwrap_or("");
|
let ext = file_path.split('.').next_back().unwrap_or("");
|
||||||
debug!("Extracting data from file: '{file_path}', format: '{fmt:?}', extension: '{ext}'");
|
debug!("Extracting data from file: '{file_path}', format: '{fmt:?}', extension: '{ext}'");
|
||||||
|
|
||||||
let stream = match ext {
|
let stream = match ext {
|
||||||
DOCX | ODT => {
|
DOCX | ODT => {
|
||||||
let from = if ext == DOCX { "docx" } else { "odt" };
|
let from = if ext == DOCX { "docx" } else { "odt" };
|
||||||
convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
|
convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
|
||||||
}
|
}
|
||||||
|
|
||||||
"pptx" => stream_pptx(file_path).await?,
|
"pptx" => stream_pptx(file_path, extract_images).await?,
|
||||||
|
|
||||||
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
|
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
|
||||||
stream_spreadsheet_as_csv(file_path).await?
|
stream_spreadsheet_as_csv(file_path).await?
|
||||||
@ -163,7 +163,13 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
|||||||
|
|
||||||
Kind::Ebook => return Err("Ebooks not yet supported".into()),
|
Kind::Ebook => return Err("Ebooks not yet supported".into()),
|
||||||
|
|
||||||
Kind::Image => chunk_image(file_path).await?,
|
Kind::Image => {
|
||||||
|
if !extract_images {
|
||||||
|
return Err("Image extraction is disabled.".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
chunk_image(file_path).await?
|
||||||
|
},
|
||||||
|
|
||||||
Kind::Other => match fmt {
|
Kind::Other => match fmt {
|
||||||
FileFormat::HypertextMarkupLanguage => {
|
FileFormat::HypertextMarkupLanguage => {
|
||||||
@ -175,7 +181,7 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
|||||||
|
|
||||||
Kind::Presentation => match fmt {
|
Kind::Presentation => match fmt {
|
||||||
FileFormat::OfficeOpenXmlPresentation => {
|
FileFormat::OfficeOpenXmlPresentation => {
|
||||||
stream_pptx(file_path).await?
|
stream_pptx(file_path, extract_images).await?
|
||||||
},
|
},
|
||||||
|
|
||||||
_ => stream_text_file(file_path).await?,
|
_ => stream_text_file(file_path).await?,
|
||||||
@ -334,11 +340,11 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
|
|||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_pptx(file_path: &str, extract_images: bool) -> Result<ChunkStream> {
|
||||||
let path = Path::new(file_path).to_owned();
|
let path = Path::new(file_path).to_owned();
|
||||||
|
|
||||||
let parser_config = ParserConfig::builder()
|
let parser_config = ParserConfig::builder()
|
||||||
.extract_images(true)
|
.extract_images(extract_images)
|
||||||
.compress_images(true)
|
.compress_images(true)
|
||||||
.quality(75)
|
.quality(75)
|
||||||
.image_handling_mode(ImageHandlingMode::Manually)
|
.image_handling_mode(ImageHandlingMode::Manually)
|
||||||
|
Loading…
Reference in New Issue
Block a user