diff --git a/pickleglass_web/package-lock.json b/pickleglass_web/package-lock.json
index a1726d6..f1d6fa0 100644
--- a/pickleglass_web/package-lock.json
+++ b/pickleglass_web/package-lock.json
@@ -42,27 +42,21 @@
         }
       },
       "node_modules/@emnapi/core": {
-        "version": "1.4.4",
         "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.4.4.tgz",
         "integrity": "sha512-A9CnAbC6ARNMKcIcrQwq6HeHCjpcBZ5wSx4U01WXCqEKlrzB9F9315WDNHkrs2xbx7YjjSxbUYxuN6EQzpcY2g==",
-        "dev": true,
         "license": "MIT",
         "optional": true,
         "dependencies": {
-          "@emnapi/wasi-threads": "1.0.3",
-          "tslib": "^2.4.0"
         }
       },
       "node_modules/@emnapi/runtime": {
-        "version": "1.4.4",
         "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.4.tgz",
         "integrity": "sha512-hHyapA4A3gPaDCNfiqyZUStTMqIkKRshqPIuDOXv1hcBnD4U3l8cP0T1HMCfGRxQ6V64TGCcoswChANyOAwbQg==",
-        "dev": true,
         "license": "MIT",
         "optional": true,
@@ -71,11 +65,9 @@
         }
       },
       "node_modules/@emnapi/wasi-threads": {
-        "version": "1.0.3",
         "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.0.3.tgz",
         "integrity": "sha512-8K5IFFsQqF9wQNJptGbS6FNKgUTsSRYnTqNCG1vPP8jFdjSv18n2mQfJpkt2Oibo9iBEzcDnDxNwKTzC7svlJw==",
-        "dev": true,
         "license": "MIT",
         "optional": true,
@@ -2675,11 +2667,9 @@
         "license": "MIT"
       },
       "node_modules/electron-to-chromium": {
-        "version": "1.5.180",
         "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.180.tgz",
         "integrity": "sha512-ED+GEyEh3kYMwt2faNmgMB0b8O5qtATGgR4RmRsIp4T6p7B8vdMbIedYndnvZfsaXvSzegtpfqRMDNCjjiSduA==",
-        "license": "ISC"
       },
       "node_modules/emoji-regex": {
diff --git a/src/common/ai/providers/gemini.js b/src/common/ai/providers/gemini.js
index be561d7..31f7e33 100644
--- a/src/common/ai/providers/gemini.js
+++ b/src/common/ai/providers/gemini.js
@@ -1,5 +1,5 @@
-const { GoogleGenerativeAI } = require('@google/generative-ai');
-const { GoogleGenAI } = require('@google/genai');
+const { GoogleGenerativeAI } = require("@google/generative-ai")
+const { GoogleGenAI } = require("@google/genai")
 
 /**
  * Creates a Gemini STT session
@@ -9,13 +9,14 @@ const { GoogleGenAI } = require('@google/genai');
  * @param {object} opts - Configuration options
  * @param {string} opts.apiKey - Gemini API key
  * @param {object} [opts.callbacks] - Event callbacks
  * @returns {Promise} STT session
  */
-async function createSTT({ apiKey, language = 'en-US', callbacks = {}, ...config }) {
-    const liveClient = new GoogleGenAI({ vertexai: false, apiKey });
+async function createSTT({ apiKey, language = "en-US", callbacks = {}, ...config }) {
+    const liveClient = new GoogleGenAI({ vertexai: false, apiKey })
 
     // Language code BCP-47 conversion
-    const lang = language.includes('-') ? language : `${language}-US`;
+    const lang = language.includes("-") ? language : `${language}-US`
 
     const session = await liveClient.live.connect({
+        model: 'gemini-live-2.5-flash-preview',
         callbacks: {
             ...callbacks,
@@ -25,313 +26,277 @@ async function createSTT({ apiKey, language = 'en-US', callbacks = {}, ...config
                 callbacks.onmessage?.(msg);
             }
         },
+        config: {
             inputAudioTranscription: {},
             speechConfig: { languageCode: lang },
         },
-    });
+    })
 
     return {
-        sendRealtimeInput: async payload => session.sendRealtimeInput(payload),
+        sendRealtimeInput: async (payload) => session.sendRealtimeInput(payload),
         close: async () => session.close(),
-    };
+    }
 }
 
 /**
- * Creates a Gemini LLM instance
- * @param {object} opts - Configuration options
- * @param {string} opts.apiKey - Gemini API key
- * @param {string} [opts.model='gemini-2.5-flash'] - Model name
- * @param {number} [opts.temperature=0.7] - Temperature
- * @param {number} [opts.maxTokens=8192] - Max tokens
- * @returns {object} LLM instance
+ * Creates a Gemini LLM instance with proper text response handling
  */
-function createLLM({ apiKey, model = 'gemini-2.5-flash', temperature = 0.7, maxTokens = 8192, ...config }) {
-    const client = new GoogleGenerativeAI(apiKey);
-
+function createLLM({ apiKey, model = "gemini-2.5-flash", temperature = 0.7, maxTokens = 8192, ...config }) {
+    const client = new GoogleGenerativeAI(apiKey)
+
     return {
         generateContent: async (parts) => {
-            const geminiModel = client.getGenerativeModel({ model: model });
-
-            let systemPrompt = '';
-            let userContent = [];
-
+            const geminiModel = client.getGenerativeModel({
+                model: model,
+                generationConfig: {
+                    temperature,
+                    maxOutputTokens: maxTokens,
+                    // Ensure we get text responses, not JSON
+                    responseMimeType: "text/plain",
+                },
+            })
+
+            const systemPrompt = ""
+            const userContent = []
+
             for (const part of parts) {
-                if (typeof part === 'string') {
-                    if (systemPrompt === '' && part.includes('You are')) {
-                        systemPrompt = part;
-                    } else {
-                        userContent.push(part);
-                    }
+                if (typeof part === "string") {
+                    // Don't automatically assume strings starting with "You are" are system prompts
+                    // Check if it's explicitly marked as a system instruction
+                    userContent.push(part)
                 } else if (part.inlineData) {
-                    // Convert base64 image data to Gemini format
                     userContent.push({
                         inlineData: {
                             mimeType: part.inlineData.mimeType,
-                            data: part.inlineData.data
-                        }
-                    });
+                            data: part.inlineData.data,
+                        },
+                    })
                 }
             }
-
-            // Prepare content array
-            const content = [];
-
-            // Add system instruction if present
-            if (systemPrompt) {
-                // For Gemini, we'll prepend system prompt to user content
-                content.push(systemPrompt + '\n\n' + userContent[0]);
-                content.push(...userContent.slice(1));
-            } else {
-                content.push(...userContent);
-            }
-
+
             try {
-                const result = await geminiModel.generateContent(content);
-                const response = await result.response;
-
+                const result = await geminiModel.generateContent(userContent)
+                const response = await result.response
+
+                // Return plain text, not wrapped in JSON structure
                 return {
                     response: {
-                        text: () => response.text()
-                    }
-                };
+                        text: () => response.text(),
+                    },
+                }
             } catch (error) {
-                console.error('Gemini API error:', error);
-                throw error;
+                console.error("Gemini API error:", error)
+                throw error
             }
         },
-
-        // For compatibility with chat-style interfaces
+
        chat: async (messages) => {
-            // Extract system instruction if present
-            let systemInstruction = '';
-            const history = [];
-            let lastMessage;
+            // Filter out any system prompts that might be causing JSON responses
+            let systemInstruction = ""
+            const history = []
+            let lastMessage
 
             messages.forEach((msg, index) => {
-                if (msg.role === 'system') {
-                    systemInstruction = msg.content;
-                    return;
+                if (msg.role === "system") {
+                    // Clean system instruction - avoid JSON formatting requests
+                    systemInstruction = msg.content
+                        .replace(/respond in json/gi, "")
+                        .replace(/format.*json/gi, "")
+                        .replace(/return.*json/gi, "")
+
+                    // Add explicit instruction for natural text
+                    if (!systemInstruction.includes("respond naturally")) {
+                        systemInstruction += "\n\nRespond naturally in plain text, not in JSON or structured format."
+                    }
+                    return
                 }
-
-                // Gemini's history format
-                const role = msg.role === 'user' ? 'user' : 'model';
+
+                const role = msg.role === "user" ? "user" : "model"
                 if (index === messages.length - 1) {
-                    lastMessage = msg;
+                    lastMessage = msg
                 } else {
-                    history.push({ role, parts: [{ text: msg.content }] });
+                    history.push({ role, parts: [{ text: msg.content }] })
                 }
-            });
-
-            const geminiModel = client.getGenerativeModel({
+            })
+
+            const geminiModel = client.getGenerativeModel({
                 model: model,
-                systemInstruction: systemInstruction
-            });
-
-            const chat = geminiModel.startChat({
-                history: history,
+                systemInstruction:
+                    systemInstruction ||
+                    "Respond naturally in plain text format. Do not use JSON or structured responses unless specifically requested.",
                 generationConfig: {
                     temperature: temperature,
                     maxOutputTokens: maxTokens,
-                }
-            });
-
-            // Get the last user message content
-            let content = lastMessage.content;
-
-            // Handle multimodal content for the last message
+                    // Force plain text responses
+                    responseMimeType: "text/plain",
+                },
+            })
+
+            const chat = geminiModel.startChat({
+                history: history,
+            })
+
+            let content = lastMessage.content
+
+            // Handle multimodal content
             if (Array.isArray(content)) {
-                const geminiContent = [];
+                const geminiContent = []
                 for (const part of content) {
-                    if (typeof part === 'string') {
-                        geminiContent.push(part);
-                    } else if (part.type === 'text') {
-                        geminiContent.push(part.text);
-                    } else if (part.type === 'image_url' && part.image_url) {
-                        // Convert base64 image to Gemini format
-                        const base64Data = part.image_url.url.split(',')[1];
+                    if (typeof part === "string") {
+                        geminiContent.push(part)
+                    } else if (part.type === "text") {
+                        geminiContent.push(part.text)
+                    } else if (part.type === "image_url" && part.image_url) {
+                        const base64Data = part.image_url.url.split(",")[1]
                         geminiContent.push({
                             inlineData: {
-                                mimeType: 'image/png',
-                                data: base64Data
-                            }
-                        });
+                                mimeType: "image/png",
+                                data: base64Data,
+                            },
+                        })
                     }
                 }
-                content = geminiContent;
+                content = geminiContent
             }
-
-            const result = await chat.sendMessage(content);
-            const response = await result.response;
+
+            const result = await chat.sendMessage(content)
+            const response = await result.response
+
+            // Return plain text content
             return {
                 content: response.text(),
-                raw: result
-            };
-        }
-    };
+                raw: result,
+            }
+        },
+    }
 }
 
 /**
- * Creates a Gemini streaming LLM instance
- * @param {object} opts - Configuration options
- * @param {string} opts.apiKey - Gemini API key
- * @param {string} [opts.model='gemini-2.5-flash'] - Model name
- * @param {number} [opts.temperature=0.7] - Temperature
- * @param {number} [opts.maxTokens=8192] - Max tokens
- * @returns {object} Streaming LLM instance
+ * Creates a Gemini streaming LLM instance with text response fix
  */
-function createStreamingLLM({ apiKey, model = 'gemini-2.5-flash', temperature = 0.7, maxTokens = 8192, ...config }) {
-    const client = new GoogleGenerativeAI(apiKey);
-
+function createStreamingLLM({ apiKey, model = "gemini-2.5-flash", temperature = 0.7, maxTokens = 8192, ...config }) {
+    const client = new GoogleGenerativeAI(apiKey)
+
     return {
         streamChat: async (messages) => {
-            console.log('[Gemini Provider] Starting streaming request');
-
-            // Extract system instruction if present
-            let systemInstruction = '';
-            const nonSystemMessages = [];
-
+            console.log("[Gemini Provider] Starting streaming request")
+
+            let systemInstruction = ""
+            const nonSystemMessages = []
+
             for (const msg of messages) {
-                if (msg.role === 'system') {
-                    systemInstruction = msg.content;
+                if (msg.role === "system") {
+                    // Clean and modify system instruction
+                    systemInstruction = msg.content
+                        .replace(/respond in json/gi, "")
+                        .replace(/format.*json/gi, "")
+                        .replace(/return.*json/gi, "")
+
+                    if (!systemInstruction.includes("respond naturally")) {
+                        systemInstruction += "\n\nRespond naturally in plain text, not in JSON or structured format."
+                    }
                 } else {
-                    nonSystemMessages.push(msg);
+                    nonSystemMessages.push(msg)
                 }
             }
-
-            const geminiModel = client.getGenerativeModel({
+
+            const geminiModel = client.getGenerativeModel({
                 model: model,
-                systemInstruction: systemInstruction || undefined
-            });
-
-            const chat = geminiModel.startChat({
-                history: [],
+                systemInstruction:
+                    systemInstruction ||
+                    "Respond naturally in plain text format. Do not use JSON or structured responses unless specifically requested.",
                 generationConfig: {
                     temperature,
                     maxOutputTokens: maxTokens || 8192,
-                }
-            });
-
-            // Create a ReadableStream to handle Gemini's streaming
+                    // Force plain text responses
+                    responseMimeType: "text/plain",
+                },
+            })
+
             const stream = new ReadableStream({
                 async start(controller) {
                     try {
-                        console.log('[Gemini Provider] Processing messages:', nonSystemMessages.length, 'messages (excluding system)');
-
-                        // Get the last user message
-                        const lastMessage = nonSystemMessages[nonSystemMessages.length - 1];
-                        let lastUserMessage = lastMessage.content;
-
-                        // Handle case where content might be an array (multimodal)
-                        if (Array.isArray(lastUserMessage)) {
-                            // Extract text content from array
-                            const textParts = lastUserMessage.filter(part =>
-                                typeof part === 'string' || (part && part.type === 'text')
-                            );
-                            lastUserMessage = textParts.map(part =>
-                                typeof part === 'string' ? part : part.text
-                            ).join(' ');
-                        }
-
-                        console.log('[Gemini Provider] Sending message to Gemini:',
-                            typeof lastUserMessage === 'string' ? lastUserMessage.substring(0, 100) + '...' : 'multimodal content');
-
-                        // Prepare the message content for Gemini
-                        let geminiContent = [];
-
-                        // Handle multimodal content properly
+                        const lastMessage = nonSystemMessages[nonSystemMessages.length - 1]
+                        let geminiContent = []
+
                         if (Array.isArray(lastMessage.content)) {
                             for (const part of lastMessage.content) {
-                                if (typeof part === 'string') {
-                                    geminiContent.push(part);
-                                } else if (part.type === 'text') {
-                                    geminiContent.push(part.text);
-                                } else if (part.type === 'image_url' && part.image_url) {
-                                    // Convert base64 image to Gemini format
-                                    const base64Data = part.image_url.url.split(',')[1];
+                                if (typeof part === "string") {
+                                    geminiContent.push(part)
+                                } else if (part.type === "text") {
+                                    geminiContent.push(part.text)
+                                } else if (part.type === "image_url" && part.image_url) {
+                                    const base64Data = part.image_url.url.split(",")[1]
                                     geminiContent.push({
                                         inlineData: {
-                                            mimeType: 'image/png',
-                                            data: base64Data
-                                        }
-                                    });
+                                            mimeType: "image/png",
+                                            data: base64Data,
+                                        },
+                                    })
                                 }
                             }
                         } else {
-                            geminiContent = [lastUserMessage];
+                            geminiContent = [lastMessage.content]
                         }
-
-                        console.log('[Gemini Provider] Prepared Gemini content:',
-                            geminiContent.length, 'parts');
-
-                        // Stream the response
-                        let chunkCount = 0;
-                        let totalContent = '';
-
-                        const contentParts = geminiContent.map(part => {
-                            if (typeof part === 'string') {
-                                return { text: part };
+
+                        const contentParts = geminiContent.map((part) => {
+                            if (typeof part === "string") {
+                                return { text: part }
                             } else if (part.inlineData) {
-                                return { inlineData: part.inlineData };
+                                return { inlineData: part.inlineData }
                             }
-                            return part;
-                        });
+                            return part
+                        })
 
                         const result = await geminiModel.generateContentStream({
-                            contents: [{
-                                role: 'user',
-                                parts: contentParts
-                            }],
-                            generationConfig: {
-                                temperature,
-                                maxOutputTokens: maxTokens || 8192,
-                            }
-                        });
-
+                            contents: [
+                                {
+                                    role: "user",
+                                    parts: contentParts,
+                                },
+                            ],
+                        })
+
                         for await (const chunk of result.stream) {
-                            chunkCount++;
-                            const chunkText = chunk.text() || '';
-                            totalContent += chunkText;
-
-                            // Format as SSE data
+                            const chunkText = chunk.text() || ""
+
+                            // Format as SSE data - this should now be plain text
                             const data = JSON.stringify({
-                                choices: [{
-                                    delta: {
-                                        content: chunkText
-                                    }
-                                }]
-                            });
-                            controller.enqueue(new TextEncoder().encode(`data: ${data}\n\n`));
+                                choices: [
+                                    {
+                                        delta: {
+                                            content: chunkText,
+                                        },
+                                    },
+                                ],
+                            })
+                            controller.enqueue(new TextEncoder().encode(`data: ${data}\n\n`))
                         }
-
-                        console.log(`[Gemini Provider] Streamed ${chunkCount} chunks, total length: ${totalContent.length} chars`);
-
-                        // Send the final done message
-                        controller.enqueue(new TextEncoder().encode('data: [DONE]\n\n'));
-                        controller.close();
-                        console.log('[Gemini Provider] Streaming completed successfully');
+
+                        controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n"))
+                        controller.close()
                     } catch (error) {
-                        console.error('[Gemini Provider] Streaming error:', error);
-                        controller.error(error);
+                        console.error("[Gemini Provider] Streaming error:", error)
+                        controller.error(error)
                     }
-                }
-            });
-
-            // Create a Response object with the stream
+                },
+            })
+
             return new Response(stream, {
                 headers: {
-                    'Content-Type': 'text/event-stream',
-                    'Cache-Control': 'no-cache',
-                    'Connection': 'keep-alive'
-                }
-            });
-        }
-    };
+                    "Content-Type": "text/event-stream",
+                    "Cache-Control": "no-cache",
+                    Connection: "keep-alive",
+                },
+            })
+        },
+    }
 }
 
 module.exports = {
     createSTT,
     createLLM,
-    createStreamingLLM
-};
+    createStreamingLLM,
+}
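
For reviewers, a minimal sketch of how the patched non-streaming path might be exercised. The module path, the GEMINI_API_KEY environment variable, and the prompt text are assumptions for illustration, not part of this change:

// Hypothetical smoke test for createLLM().chat() after this patch.
// With the new code, a system message that asks for JSON gets scrubbed
// and a "respond naturally in plain text" instruction is appended.
const { createLLM } = require('./src/common/ai/providers/gemini');

async function smokeTest() {
    const llm = createLLM({ apiKey: process.env.GEMINI_API_KEY }); // assumed env var
    const reply = await llm.chat([
        { role: 'system', content: 'You are a helpful assistant.' },
        { role: 'user', content: 'Say hello in one sentence.' },
    ]);
    console.log(reply.content); // plain text, not a JSON envelope
}

smokeTest().catch(console.error);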
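
And a sketch of consuming the fetch-style Response that streamChat returns, whose body is an SSE stream of OpenAI-shaped delta frames as built in this diff. This assumes Node 18+ for the global Response/TextDecoder, and the line splitting is naive (it assumes each read ends on a frame boundary):

// Hypothetical consumer of the `data: {"choices":[{"delta":...}]}` SSE frames.
const { createStreamingLLM } = require('./src/common/ai/providers/gemini');

async function streamDemo() {
    const streamer = createStreamingLLM({ apiKey: process.env.GEMINI_API_KEY });
    const response = await streamer.streamChat([{ role: 'user', content: 'Count to three.' }]);

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        // Each frame is `data: {...}\n\n`; `data: [DONE]` terminates the stream.
        for (const line of decoder.decode(value).split('\n')) {
            if (!line.startsWith('data: ') || line.includes('[DONE]')) continue;
            process.stdout.write(JSON.parse(line.slice(6)).choices[0].delta.content);
        }
    }
}

streamDemo().catch(console.error);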