From f75c5a21b813e4f9d1fc6e51a08daabcb4941c9b Mon Sep 17 00:00:00 2001
From: Thorsten Sommer
Date: Fri, 9 Jan 2026 19:02:57 +0100
Subject: [PATCH] Implemented the transcription API

---
Review notes (free-text area of the patch, not part of the commit message):
 * PerformStandardTranscriptionRequest now awaits the response body instead of
   blocking on `.Result`, logs non-success status codes, passes the caught
   exception to the logger, and never returns a null text.
 * The line structure of this patch as well as the generic type arguments and
   XML doc tags on added lines were reconstructed after a lossy text
   extraction. Context lines are reproduced as extracted (some lack generic
   arguments or doc tags), and the blob hashes on the `index` lines are those
   of the original commit. Regenerate the patch from the repository before
   applying it.

 .../AlibabaCloud/ProviderAlibabaCloud.cs      |  6 ++
 .../Provider/Anthropic/ProviderAnthropic.cs   |  6 ++
 .../Provider/BaseProvider.cs                  | 68 +++++++++++++++++++
 .../Provider/DeepSeek/ProviderDeepSeek.cs     |  6 ++
 .../Provider/Fireworks/ProviderFireworks.cs   |  7 ++
 .../Provider/GWDG/ProviderGWDG.cs             |  7 ++
 .../Provider/Google/ProviderGoogle.cs         |  6 ++
 .../Provider/Groq/ProviderGroq.cs             |  6 ++
 .../Provider/Helmholtz/ProviderHelmholtz.cs   |  6 ++
 .../HuggingFace/ProviderHuggingFace.cs        |  6 ++
 app/MindWork AI Studio/Provider/IProvider.cs  | 10 +++
 .../Provider/Mistral/ProviderMistral.cs       |  7 ++
 app/MindWork AI Studio/Provider/NoProvider.cs |  2 +
 .../Provider/OpenAI/ProviderOpenAI.cs         |  7 ++
 .../Provider/OpenRouter/ProviderOpenRouter.cs |  6 ++
 .../Provider/Perplexity/ProviderPerplexity.cs |  6 ++
 .../Provider/SelfHosted/ProviderSelfHosted.cs |  7 ++
 .../Provider/TranscriptionResponse.cs         |  3 +
 .../Provider/X/ProviderX.cs                   |  8 ++-
 19 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 app/MindWork AI Studio/Provider/TranscriptionResponse.cs

diff --git a/app/MindWork AI Studio/Provider/AlibabaCloud/ProviderAlibabaCloud.cs b/app/MindWork AI Studio/Provider/AlibabaCloud/ProviderAlibabaCloud.cs
index 6b648372..9d2e0792 100644
--- a/app/MindWork AI Studio/Provider/AlibabaCloud/ProviderAlibabaCloud.cs
+++ b/app/MindWork AI Studio/Provider/AlibabaCloud/ProviderAlibabaCloud.cs
@@ -80,6 +80,12 @@ public sealed class ProviderAlibabaCloud() : BaseProvider(LLMProviders.ALIBABA_C
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/Anthropic/ProviderAnthropic.cs b/app/MindWork AI Studio/Provider/Anthropic/ProviderAnthropic.cs
index 42268936..2b45cc44 100644
--- a/app/MindWork AI Studio/Provider/Anthropic/ProviderAnthropic.cs
+++ b/app/MindWork AI Studio/Provider/Anthropic/ProviderAnthropic.cs
@@ -107,6 +107,12 @@ public sealed class ProviderAnthropic() : BaseProvider(LLMProviders.ANTHROPIC, "
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/BaseProvider.cs b/app/MindWork AI Studio/Provider/BaseProvider.cs
index c5594087..9801e16b 100644
--- a/app/MindWork AI Studio/Provider/BaseProvider.cs
+++ b/app/MindWork AI Studio/Provider/BaseProvider.cs
@@ -1,4 +1,5 @@
 using System.Net;
+using System.Net.Http.Headers;
 using System.Runtime.CompilerServices;
 using System.Text.Json;
 using System.Text.Json.Serialization;
@@ -6,10 +7,15 @@ using System.Text.Json.Serialization;
 using AIStudio.Chat;
 using AIStudio.Provider.Anthropic;
 using AIStudio.Provider.OpenAI;
+using AIStudio.Provider.SelfHosted;
 using AIStudio.Settings;
+using AIStudio.Tools.MIME;
 using AIStudio.Tools.PluginSystem;
+using AIStudio.Tools.Rust;
 using AIStudio.Tools.Services;
 
+using Host = AIStudio.Provider.SelfHosted.Host;
+
 namespace AIStudio.Provider;
 
 ///
@@ -89,6 +95,9 @@ public abstract class BaseProvider : IProvider, ISecretId
     ///
     public abstract IAsyncEnumerable StreamImageCompletion(Model imageModel, string promptPositive, string promptNegative = FilterOperator.String.Empty, ImageURL referenceImageURL = default, CancellationToken token = default);
 
+    /// <inheritdoc />
+    public abstract Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default);
+
     ///
     public abstract Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default);
 
@@ -536,6 +545,65 @@ public abstract class BaseProvider : IProvider, ISecretId
         streamReader.Dispose();
     }
 
+    /// <summary>
+    /// Uploads the audio file as multipart form data to the transcription URL of the
+    /// given host and returns the transcribed text.
+    /// </summary>
+    /// <param name="requestedSecret">The API key lookup result; a bearer token is only sent when the lookup succeeded.</param>
+    /// <param name="transcriptionModel">The model whose ID is sent as the "model" form field.</param>
+    /// <param name="audioFilePath">The path of the audio file to upload.</param>
+    /// <param name="host">The host whose transcription URL is used for the request.</param>
+    /// <param name="token">The cancellation token.</param>
+    /// <returns>The transcribed text, or an empty string when the request fails for any reason (every exception, including cancellation, is caught and logged).</returns>
+    protected async Task<string> PerformStandardTranscriptionRequest(RequestedSecret requestedSecret, Model transcriptionModel, string audioFilePath, Host host = Host.NONE, CancellationToken token = default)
+    {
+        try
+        {
+            using var form = new MultipartFormDataContent();
+            var mimeType = Builder.FromFilename(audioFilePath);
+
+            await using var fileStream = File.OpenRead(audioFilePath);
+            using var fileContent = new StreamContent(fileStream);
+            fileContent.Headers.ContentType = new MediaTypeHeaderValue(mimeType);
+
+            form.Add(fileContent, "file", Path.GetFileName(audioFilePath));
+            form.Add(new StringContent(transcriptionModel.Id), "model");
+
+            using var request = new HttpRequestMessage(HttpMethod.Post, host.TranscriptionURL());
+            request.Content = form;
+
+            if(requestedSecret.Success)
+                request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", await requestedSecret.Secret.Decrypt(ENCRYPTION));
+
+            using var response = await this.httpClient.SendAsync(request, token);
+            // Await the body: blocking on .Result can deadlock under a synchronization
+            // context and wraps failures in an AggregateException.
+            var responseBody = await response.Content.ReadAsStringAsync(token);
+
+            if (!response.IsSuccessStatusCode)
+            {
+                this.logger.LogError("The transcription request failed with status code {StatusCode}: '{ResponseBody}'.", response.StatusCode, responseBody);
+                return string.Empty;
+            }
+
+            var transcriptionResponse = JsonSerializer.Deserialize<TranscriptionResponse>(responseBody, JSON_SERIALIZER_OPTIONS);
+            if(transcriptionResponse is null)
+            {
+                this.logger.LogError("Was not able to deserialize the transcription response.");
+                return string.Empty;
+            }
+
+            // Text is null when the response JSON carries no matching property.
+            return transcriptionResponse.Text ?? string.Empty;
+        }
+        catch (Exception e)
+        {
+            // Pass the exception itself so that the log keeps its type and stack trace.
+            this.logger.LogError(e, "Failed to perform transcription request: '{Message}'.", e.Message);
+            return string.Empty;
+        }
+    }
+
     ///
     /// Parse and convert API parameters from a provided JSON string into a dictionary,
     /// optionally merging additional parameters and removing specific keys.
diff --git a/app/MindWork AI Studio/Provider/DeepSeek/ProviderDeepSeek.cs b/app/MindWork AI Studio/Provider/DeepSeek/ProviderDeepSeek.cs
index b2715f47..39ecd21e 100644
--- a/app/MindWork AI Studio/Provider/DeepSeek/ProviderDeepSeek.cs
+++ b/app/MindWork AI Studio/Provider/DeepSeek/ProviderDeepSeek.cs
@@ -80,6 +80,12 @@ public sealed class ProviderDeepSeek() : BaseProvider(LLMProviders.DEEP_SEEK, "h
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/Fireworks/ProviderFireworks.cs b/app/MindWork AI Studio/Provider/Fireworks/ProviderFireworks.cs
index 9450134d..a3f27a07 100644
--- a/app/MindWork AI Studio/Provider/Fireworks/ProviderFireworks.cs
+++ b/app/MindWork AI Studio/Provider/Fireworks/ProviderFireworks.cs
@@ -81,6 +81,13 @@ public class ProviderFireworks() : BaseProvider(LLMProviders.FIREWORKS, "https:/
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override async Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        var requestedSecret = await RUST_SERVICE.GetAPIKey(this, isTrying: true);
+        return await this.PerformStandardTranscriptionRequest(requestedSecret, transcriptionModel, audioFilePath, token: token);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/GWDG/ProviderGWDG.cs b/app/MindWork AI Studio/Provider/GWDG/ProviderGWDG.cs
index da322942..16686b31 100644
--- a/app/MindWork AI Studio/Provider/GWDG/ProviderGWDG.cs
+++ b/app/MindWork AI Studio/Provider/GWDG/ProviderGWDG.cs
@@ -80,6 +80,13 @@ public sealed class ProviderGWDG() : BaseProvider(LLMProviders.GWDG, "https://ch
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override async Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        var requestedSecret = await RUST_SERVICE.GetAPIKey(this, isTrying: true);
+        return await this.PerformStandardTranscriptionRequest(requestedSecret, transcriptionModel, audioFilePath, token: token);
+    }
 
     ///
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/Google/ProviderGoogle.cs b/app/MindWork AI Studio/Provider/Google/ProviderGoogle.cs
index fce1a451..176bbeb5 100644
--- a/app/MindWork AI Studio/Provider/Google/ProviderGoogle.cs
+++ b/app/MindWork AI Studio/Provider/Google/ProviderGoogle.cs
@@ -82,6 +82,12 @@ public class ProviderGoogle() : BaseProvider(LLMProviders.GOOGLE, "https://gener
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
 
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Provider.Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
+
     ///
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
     {
diff --git a/app/MindWork AI Studio/Provider/Groq/ProviderGroq.cs b/app/MindWork AI Studio/Provider/Groq/ProviderGroq.cs
index b6e9137a..0bbc616f 100644
--- a/app/MindWork AI Studio/Provider/Groq/ProviderGroq.cs
+++ b/app/MindWork AI Studio/Provider/Groq/ProviderGroq.cs
@@ -81,6 +81,12 @@ public class ProviderGroq() : BaseProvider(LLMProviders.GROQ, "https://api.groq.
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/Helmholtz/ProviderHelmholtz.cs b/app/MindWork AI Studio/Provider/Helmholtz/ProviderHelmholtz.cs
index 213bf075..cc52cd20 100644
--- a/app/MindWork AI Studio/Provider/Helmholtz/ProviderHelmholtz.cs
+++ b/app/MindWork AI Studio/Provider/Helmholtz/ProviderHelmholtz.cs
@@ -80,6 +80,12 @@ public sealed class ProviderHelmholtz() : BaseProvider(LLMProviders.HELMHOLTZ, "
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/HuggingFace/ProviderHuggingFace.cs b/app/MindWork AI Studio/Provider/HuggingFace/ProviderHuggingFace.cs
index 794b4f42..a9778988 100644
--- a/app/MindWork AI Studio/Provider/HuggingFace/ProviderHuggingFace.cs
+++ b/app/MindWork AI Studio/Provider/HuggingFace/ProviderHuggingFace.cs
@@ -85,6 +85,12 @@ public sealed class ProviderHuggingFace : BaseProvider
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/IProvider.cs b/app/MindWork AI Studio/Provider/IProvider.cs
index 4ae6dc6c..5c390074 100644
--- a/app/MindWork AI Studio/Provider/IProvider.cs
+++ b/app/MindWork AI Studio/Provider/IProvider.cs
@@ -50,6 +50,16 @@ public interface IProvider
     /// The image completion stream.
     public IAsyncEnumerable StreamImageCompletion(Model imageModel, string promptPositive, string promptNegative = FilterOperator.String.Empty, ImageURL referenceImageURL = default, CancellationToken token = default);
 
+    /// <summary>
+    /// Transcribe an audio file.
+    /// </summary>
+    /// <param name="transcriptionModel">The model to use for transcription.</param>
+    /// <param name="audioFilePath">The audio file path.</param>
+    /// <param name="settingsManager">The settings manager instance to use.</param>
+    /// <param name="token">The cancellation token.</param>
+    /// <returns>The transcription result.</returns>
+    public Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default);
+
     ///
     /// Load all possible text models that can be used with this provider.
     ///
diff --git a/app/MindWork AI Studio/Provider/Mistral/ProviderMistral.cs b/app/MindWork AI Studio/Provider/Mistral/ProviderMistral.cs
index 598f7016..522b9e4d 100644
--- a/app/MindWork AI Studio/Provider/Mistral/ProviderMistral.cs
+++ b/app/MindWork AI Studio/Provider/Mistral/ProviderMistral.cs
@@ -81,6 +81,13 @@ public sealed class ProviderMistral() : BaseProvider(LLMProviders.MISTRAL, "http
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override async Task<string> TranscribeAudioAsync(Provider.Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        var requestedSecret = await RUST_SERVICE.GetAPIKey(this, isTrying: true);
+        return await this.PerformStandardTranscriptionRequest(requestedSecret, transcriptionModel, audioFilePath, token: token);
+    }
 
     ///
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/NoProvider.cs b/app/MindWork AI Studio/Provider/NoProvider.cs
index 4f92e5c9..a650ac34 100644
--- a/app/MindWork AI Studio/Provider/NoProvider.cs
+++ b/app/MindWork AI Studio/Provider/NoProvider.cs
@@ -38,6 +38,8 @@ public class NoProvider : IProvider
         yield break;
     }
 
+    public Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default) => Task.FromResult(string.Empty);
+
     public IReadOnlyCollection GetModelCapabilities(Model model) => [ Capability.NONE ];
 
     #endregion
diff --git a/app/MindWork AI Studio/Provider/OpenAI/ProviderOpenAI.cs b/app/MindWork AI Studio/Provider/OpenAI/ProviderOpenAI.cs
index d06d6e15..76521cd4 100644
--- a/app/MindWork AI Studio/Provider/OpenAI/ProviderOpenAI.cs
+++ b/app/MindWork AI Studio/Provider/OpenAI/ProviderOpenAI.cs
@@ -217,6 +217,13 @@ public sealed class ProviderOpenAI() : BaseProvider(LLMProviders.OPEN_AI, "https
     }
 
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override async Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        var requestedSecret = await RUST_SERVICE.GetAPIKey(this, isTrying: true);
+        return await this.PerformStandardTranscriptionRequest(requestedSecret, transcriptionModel, audioFilePath, token: token);
+    }
 
     ///
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/OpenRouter/ProviderOpenRouter.cs b/app/MindWork AI Studio/Provider/OpenRouter/ProviderOpenRouter.cs
index d6945799..79d02de6 100644
--- a/app/MindWork AI Studio/Provider/OpenRouter/ProviderOpenRouter.cs
+++ b/app/MindWork AI Studio/Provider/OpenRouter/ProviderOpenRouter.cs
@@ -88,6 +88,12 @@ public sealed class ProviderOpenRouter() : BaseProvider(LLMProviders.OPEN_ROUTER
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/Perplexity/ProviderPerplexity.cs b/app/MindWork AI Studio/Provider/Perplexity/ProviderPerplexity.cs
index 0616f2d9..38c6f9b7 100644
--- a/app/MindWork AI Studio/Provider/Perplexity/ProviderPerplexity.cs
+++ b/app/MindWork AI Studio/Provider/Perplexity/ProviderPerplexity.cs
@@ -88,6 +88,12 @@ public sealed class ProviderPerplexity() : BaseProvider(LLMProviders.PERPLEXITY,
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
 
     ///
     public override Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
diff --git a/app/MindWork AI Studio/Provider/SelfHosted/ProviderSelfHosted.cs b/app/MindWork AI Studio/Provider/SelfHosted/ProviderSelfHosted.cs
index a61a3b26..8b472c09 100644
--- a/app/MindWork AI Studio/Provider/SelfHosted/ProviderSelfHosted.cs
+++ b/app/MindWork AI Studio/Provider/SelfHosted/ProviderSelfHosted.cs
@@ -88,6 +88,13 @@ public sealed class ProviderSelfHosted(Host host, string hostname) : BaseProvide
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
 
+    /// <inheritdoc />
+    public override async Task<string> TranscribeAudioAsync(Provider.Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        var requestedSecret = await RUST_SERVICE.GetAPIKey(this, isTrying: true);
+        return await this.PerformStandardTranscriptionRequest(requestedSecret, transcriptionModel, audioFilePath, host, token);
+    }
+
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
     {
         try
diff --git a/app/MindWork AI Studio/Provider/TranscriptionResponse.cs b/app/MindWork AI Studio/Provider/TranscriptionResponse.cs
new file mode 100644
index 00000000..7ba1f587
--- /dev/null
+++ b/app/MindWork AI Studio/Provider/TranscriptionResponse.cs
@@ -0,0 +1,3 @@
+namespace AIStudio.Provider;
+
+public sealed record TranscriptionResponse(string Text);
diff --git a/app/MindWork AI Studio/Provider/X/ProviderX.cs b/app/MindWork AI Studio/Provider/X/ProviderX.cs
index 92aad1eb..373a3b58 100644
--- a/app/MindWork AI Studio/Provider/X/ProviderX.cs
+++ b/app/MindWork AI Studio/Provider/X/ProviderX.cs
@@ -81,7 +81,13 @@ public sealed class ProviderX() : BaseProvider(LLMProviders.X, "https://api.x.ai
         yield break;
     }
     #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
-
+    
+    /// <inheritdoc />
+    public override Task<string> TranscribeAudioAsync(Model transcriptionModel, string audioFilePath, SettingsManager settingsManager, CancellationToken token = default)
+    {
+        return Task.FromResult(string.Empty);
+    }
+
     ///
     public override async Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default)
     {