Merge branch 'pr-84'

This commit is contained in:
sanio 2025-07-08 22:43:39 +09:00
commit 55961c956a
2 changed files with 192 additions and 237 deletions

View File

@ -42,27 +42,21 @@
} }
}, },
"node_modules/@emnapi/core": { "node_modules/@emnapi/core": {
"version": "1.4.4", "version": "1.4.4",
"resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.4.4.tgz", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.4.4.tgz",
"integrity": "sha512-A9CnAbC6ARNMKcIcrQwq6HeHCjpcBZ5wSx4U01WXCqEKlrzB9F9315WDNHkrs2xbx7YjjSxbUYxuN6EQzpcY2g==", "integrity": "sha512-A9CnAbC6ARNMKcIcrQwq6HeHCjpcBZ5wSx4U01WXCqEKlrzB9F9315WDNHkrs2xbx7YjjSxbUYxuN6EQzpcY2g==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
"@emnapi/wasi-threads": "1.0.3", "@emnapi/wasi-threads": "1.0.3",
"tslib": "^2.4.0" "tslib": "^2.4.0"
} }
}, },
"node_modules/@emnapi/runtime": { "node_modules/@emnapi/runtime": {
"version": "1.4.4", "version": "1.4.4",
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.4.tgz", "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.4.tgz",
"integrity": "sha512-hHyapA4A3gPaDCNfiqyZUStTMqIkKRshqPIuDOXv1hcBnD4U3l8cP0T1HMCfGRxQ6V64TGCcoswChANyOAwbQg==", "integrity": "sha512-hHyapA4A3gPaDCNfiqyZUStTMqIkKRshqPIuDOXv1hcBnD4U3l8cP0T1HMCfGRxQ6V64TGCcoswChANyOAwbQg==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
@ -71,11 +65,9 @@
} }
}, },
"node_modules/@emnapi/wasi-threads": { "node_modules/@emnapi/wasi-threads": {
"version": "1.0.3", "version": "1.0.3",
"resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.0.3.tgz", "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.0.3.tgz",
"integrity": "sha512-8K5IFFsQqF9wQNJptGbS6FNKgUTsSRYnTqNCG1vPP8jFdjSv18n2mQfJpkt2Oibo9iBEzcDnDxNwKTzC7svlJw==", "integrity": "sha512-8K5IFFsQqF9wQNJptGbS6FNKgUTsSRYnTqNCG1vPP8jFdjSv18n2mQfJpkt2Oibo9iBEzcDnDxNwKTzC7svlJw==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
@ -2675,11 +2667,9 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/electron-to-chromium": { "node_modules/electron-to-chromium": {
"version": "1.5.180", "version": "1.5.180",
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.180.tgz", "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.180.tgz",
"integrity": "sha512-ED+GEyEh3kYMwt2faNmgMB0b8O5qtATGgR4RmRsIp4T6p7B8vdMbIedYndnvZfsaXvSzegtpfqRMDNCjjiSduA==", "integrity": "sha512-ED+GEyEh3kYMwt2faNmgMB0b8O5qtATGgR4RmRsIp4T6p7B8vdMbIedYndnvZfsaXvSzegtpfqRMDNCjjiSduA==",
"license": "ISC" "license": "ISC"
}, },
"node_modules/emoji-regex": { "node_modules/emoji-regex": {

View File

@ -1,5 +1,5 @@
const { GoogleGenerativeAI } = require('@google/generative-ai'); const { GoogleGenerativeAI } = require("@google/generative-ai")
const { GoogleGenAI } = require('@google/genai'); const { GoogleGenAI } = require("@google/genai")
/** /**
* Creates a Gemini STT session * Creates a Gemini STT session
@ -9,13 +9,14 @@ const { GoogleGenAI } = require('@google/genai');
* @param {object} [opts.callbacks] - Event callbacks * @param {object} [opts.callbacks] - Event callbacks
* @returns {Promise<object>} STT session * @returns {Promise<object>} STT session
*/ */
async function createSTT({ apiKey, language = 'en-US', callbacks = {}, ...config }) { async function createSTT({ apiKey, language = "en-US", callbacks = {}, ...config }) {
const liveClient = new GoogleGenAI({ vertexai: false, apiKey }); const liveClient = new GoogleGenAI({ vertexai: false, apiKey })
// Language code BCP-47 conversion // Language code BCP-47 conversion
const lang = language.includes('-') ? language : `${language}-US`; const lang = language.includes("-") ? language : `${language}-US`
const session = await liveClient.live.connect({ const session = await liveClient.live.connect({
model: 'gemini-live-2.5-flash-preview', model: 'gemini-live-2.5-flash-preview',
callbacks: { callbacks: {
...callbacks, ...callbacks,
@ -25,313 +26,277 @@ async function createSTT({ apiKey, language = 'en-US', callbacks = {}, ...config
callbacks.onmessage?.(msg); callbacks.onmessage?.(msg);
} }
}, },
config: { config: {
inputAudioTranscription: {}, inputAudioTranscription: {},
speechConfig: { languageCode: lang }, speechConfig: { languageCode: lang },
}, },
}); })
return { return {
sendRealtimeInput: async payload => session.sendRealtimeInput(payload), sendRealtimeInput: async (payload) => session.sendRealtimeInput(payload),
close: async () => session.close(), close: async () => session.close(),
}; }
} }
/** /**
* Creates a Gemini LLM instance * Creates a Gemini LLM instance with proper text response handling
* @param {object} opts - Configuration options
* @param {string} opts.apiKey - Gemini API key
* @param {string} [opts.model='gemini-2.5-flash'] - Model name
* @param {number} [opts.temperature=0.7] - Temperature
* @param {number} [opts.maxTokens=8192] - Max tokens
* @returns {object} LLM instance
*/ */
function createLLM({ apiKey, model = 'gemini-2.5-flash', temperature = 0.7, maxTokens = 8192, ...config }) { function createLLM({ apiKey, model = "gemini-2.5-flash", temperature = 0.7, maxTokens = 8192, ...config }) {
const client = new GoogleGenerativeAI(apiKey); const client = new GoogleGenerativeAI(apiKey)
return { return {
generateContent: async (parts) => { generateContent: async (parts) => {
const geminiModel = client.getGenerativeModel({ model: model }); const geminiModel = client.getGenerativeModel({
model: model,
generationConfig: {
temperature,
maxOutputTokens: maxTokens,
// Ensure we get text responses, not JSON
responseMimeType: "text/plain",
},
})
let systemPrompt = ''; const systemPrompt = ""
let userContent = []; const userContent = []
for (const part of parts) { for (const part of parts) {
if (typeof part === 'string') { if (typeof part === "string") {
if (systemPrompt === '' && part.includes('You are')) { // Don't automatically assume strings starting with "You are" are system prompts
systemPrompt = part; // Check if it's explicitly marked as a system instruction
} else { userContent.push(part)
userContent.push(part);
}
} else if (part.inlineData) { } else if (part.inlineData) {
// Convert base64 image data to Gemini format
userContent.push({ userContent.push({
inlineData: { inlineData: {
mimeType: part.inlineData.mimeType, mimeType: part.inlineData.mimeType,
data: part.inlineData.data data: part.inlineData.data,
},
})
} }
});
}
}
// Prepare content array
const content = [];
// Add system instruction if present
if (systemPrompt) {
// For Gemini, we'll prepend system prompt to user content
content.push(systemPrompt + '\n\n' + userContent[0]);
content.push(...userContent.slice(1));
} else {
content.push(...userContent);
} }
try { try {
const result = await geminiModel.generateContent(content); const result = await geminiModel.generateContent(userContent)
const response = await result.response; const response = await result.response
// Return plain text, not wrapped in JSON structure
return { return {
response: { response: {
text: () => response.text() text: () => response.text(),
},
} }
};
} catch (error) { } catch (error) {
console.error('Gemini API error:', error); console.error("Gemini API error:", error)
throw error; throw error
} }
}, },
// For compatibility with chat-style interfaces
chat: async (messages) => { chat: async (messages) => {
// Extract system instruction if present // Filter out any system prompts that might be causing JSON responses
let systemInstruction = ''; let systemInstruction = ""
const history = []; const history = []
let lastMessage; let lastMessage
messages.forEach((msg, index) => { messages.forEach((msg, index) => {
if (msg.role === 'system') { if (msg.role === "system") {
systemInstruction = msg.content; // Clean system instruction - avoid JSON formatting requests
return; systemInstruction = msg.content
.replace(/respond in json/gi, "")
.replace(/format.*json/gi, "")
.replace(/return.*json/gi, "")
// Add explicit instruction for natural text
if (!systemInstruction.includes("respond naturally")) {
systemInstruction += "\n\nRespond naturally in plain text, not in JSON or structured format."
}
return
} }
// Gemini's history format const role = msg.role === "user" ? "user" : "model"
const role = msg.role === 'user' ? 'user' : 'model';
if (index === messages.length - 1) { if (index === messages.length - 1) {
lastMessage = msg; lastMessage = msg
} else { } else {
history.push({ role, parts: [{ text: msg.content }] }); history.push({ role, parts: [{ text: msg.content }] })
} }
}); })
const geminiModel = client.getGenerativeModel({ const geminiModel = client.getGenerativeModel({
model: model, model: model,
systemInstruction: systemInstruction systemInstruction:
}); systemInstruction ||
"Respond naturally in plain text format. Do not use JSON or structured responses unless specifically requested.",
const chat = geminiModel.startChat({
history: history,
generationConfig: { generationConfig: {
temperature: temperature, temperature: temperature,
maxOutputTokens: maxTokens, maxOutputTokens: maxTokens,
} // Force plain text responses
}); responseMimeType: "text/plain",
},
})
// Get the last user message content const chat = geminiModel.startChat({
let content = lastMessage.content; history: history,
})
// Handle multimodal content for the last message let content = lastMessage.content
// Handle multimodal content
if (Array.isArray(content)) { if (Array.isArray(content)) {
const geminiContent = []; const geminiContent = []
for (const part of content) { for (const part of content) {
if (typeof part === 'string') { if (typeof part === "string") {
geminiContent.push(part); geminiContent.push(part)
} else if (part.type === 'text') { } else if (part.type === "text") {
geminiContent.push(part.text); geminiContent.push(part.text)
} else if (part.type === 'image_url' && part.image_url) { } else if (part.type === "image_url" && part.image_url) {
// Convert base64 image to Gemini format const base64Data = part.image_url.url.split(",")[1]
const base64Data = part.image_url.url.split(',')[1];
geminiContent.push({ geminiContent.push({
inlineData: { inlineData: {
mimeType: 'image/png', mimeType: "image/png",
data: base64Data data: base64Data,
} },
}); })
} }
} }
content = geminiContent; content = geminiContent
} }
const result = await chat.sendMessage(content); const result = await chat.sendMessage(content)
const response = await result.response; const response = await result.response
// Return plain text content
return { return {
content: response.text(), content: response.text(),
raw: result raw: result,
}; }
},
} }
};
} }
/** /**
* Creates a Gemini streaming LLM instance * Creates a Gemini streaming LLM instance with text response fix
* @param {object} opts - Configuration options
* @param {string} opts.apiKey - Gemini API key
* @param {string} [opts.model='gemini-2.5-flash'] - Model name
* @param {number} [opts.temperature=0.7] - Temperature
* @param {number} [opts.maxTokens=8192] - Max tokens
* @returns {object} Streaming LLM instance
*/ */
function createStreamingLLM({ apiKey, model = 'gemini-2.5-flash', temperature = 0.7, maxTokens = 8192, ...config }) { function createStreamingLLM({ apiKey, model = "gemini-2.5-flash", temperature = 0.7, maxTokens = 8192, ...config }) {
const client = new GoogleGenerativeAI(apiKey); const client = new GoogleGenerativeAI(apiKey)
return { return {
streamChat: async (messages) => { streamChat: async (messages) => {
console.log('[Gemini Provider] Starting streaming request'); console.log("[Gemini Provider] Starting streaming request")
// Extract system instruction if present let systemInstruction = ""
let systemInstruction = ''; const nonSystemMessages = []
const nonSystemMessages = [];
for (const msg of messages) { for (const msg of messages) {
if (msg.role === 'system') { if (msg.role === "system") {
systemInstruction = msg.content; // Clean and modify system instruction
systemInstruction = msg.content
.replace(/respond in json/gi, "")
.replace(/format.*json/gi, "")
.replace(/return.*json/gi, "")
if (!systemInstruction.includes("respond naturally")) {
systemInstruction += "\n\nRespond naturally in plain text, not in JSON or structured format."
}
} else { } else {
nonSystemMessages.push(msg); nonSystemMessages.push(msg)
} }
} }
const geminiModel = client.getGenerativeModel({ const geminiModel = client.getGenerativeModel({
model: model, model: model,
systemInstruction: systemInstruction || undefined systemInstruction:
}); systemInstruction ||
"Respond naturally in plain text format. Do not use JSON or structured responses unless specifically requested.",
const chat = geminiModel.startChat({
history: [],
generationConfig: { generationConfig: {
temperature, temperature,
maxOutputTokens: maxTokens || 8192, maxOutputTokens: maxTokens || 8192,
} // Force plain text responses
}); responseMimeType: "text/plain",
},
})
// Create a ReadableStream to handle Gemini's streaming
const stream = new ReadableStream({ const stream = new ReadableStream({
async start(controller) { async start(controller) {
try { try {
console.log('[Gemini Provider] Processing messages:', nonSystemMessages.length, 'messages (excluding system)'); const lastMessage = nonSystemMessages[nonSystemMessages.length - 1]
let geminiContent = []
// Get the last user message
const lastMessage = nonSystemMessages[nonSystemMessages.length - 1];
let lastUserMessage = lastMessage.content;
// Handle case where content might be an array (multimodal)
if (Array.isArray(lastUserMessage)) {
// Extract text content from array
const textParts = lastUserMessage.filter(part =>
typeof part === 'string' || (part && part.type === 'text')
);
lastUserMessage = textParts.map(part =>
typeof part === 'string' ? part : part.text
).join(' ');
}
console.log('[Gemini Provider] Sending message to Gemini:',
typeof lastUserMessage === 'string' ? lastUserMessage.substring(0, 100) + '...' : 'multimodal content');
// Prepare the message content for Gemini
let geminiContent = [];
// Handle multimodal content properly
if (Array.isArray(lastMessage.content)) { if (Array.isArray(lastMessage.content)) {
for (const part of lastMessage.content) { for (const part of lastMessage.content) {
if (typeof part === 'string') { if (typeof part === "string") {
geminiContent.push(part); geminiContent.push(part)
} else if (part.type === 'text') { } else if (part.type === "text") {
geminiContent.push(part.text); geminiContent.push(part.text)
} else if (part.type === 'image_url' && part.image_url) { } else if (part.type === "image_url" && part.image_url) {
// Convert base64 image to Gemini format const base64Data = part.image_url.url.split(",")[1]
const base64Data = part.image_url.url.split(',')[1];
geminiContent.push({ geminiContent.push({
inlineData: { inlineData: {
mimeType: 'image/png', mimeType: "image/png",
data: base64Data data: base64Data,
} },
}); })
} }
} }
} else { } else {
geminiContent = [lastUserMessage]; geminiContent = [lastMessage.content]
} }
console.log('[Gemini Provider] Prepared Gemini content:', const contentParts = geminiContent.map((part) => {
geminiContent.length, 'parts'); if (typeof part === "string") {
return { text: part }
// Stream the response
let chunkCount = 0;
let totalContent = '';
const contentParts = geminiContent.map(part => {
if (typeof part === 'string') {
return { text: part };
} else if (part.inlineData) { } else if (part.inlineData) {
return { inlineData: part.inlineData }; return { inlineData: part.inlineData }
} }
return part; return part
}); })
const result = await geminiModel.generateContentStream({ const result = await geminiModel.generateContentStream({
contents: [{ contents: [
role: 'user', {
parts: contentParts role: "user",
}], parts: contentParts,
generationConfig: { },
temperature, ],
maxOutputTokens: maxTokens || 8192, })
}
});
for await (const chunk of result.stream) { for await (const chunk of result.stream) {
chunkCount++; const chunkText = chunk.text() || ""
const chunkText = chunk.text() || '';
totalContent += chunkText;
// Format as SSE data // Format as SSE data - this should now be plain text
const data = JSON.stringify({ const data = JSON.stringify({
choices: [{ choices: [
{
delta: { delta: {
content: chunkText content: chunkText,
} },
}] },
}); ],
controller.enqueue(new TextEncoder().encode(`data: ${data}\n\n`)); })
controller.enqueue(new TextEncoder().encode(`data: ${data}\n\n`))
} }
console.log(`[Gemini Provider] Streamed ${chunkCount} chunks, total length: ${totalContent.length} chars`); controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n"))
controller.close()
// Send the final done message
controller.enqueue(new TextEncoder().encode('data: [DONE]\n\n'));
controller.close();
console.log('[Gemini Provider] Streaming completed successfully');
} catch (error) { } catch (error) {
console.error('[Gemini Provider] Streaming error:', error); console.error("[Gemini Provider] Streaming error:", error)
controller.error(error); controller.error(error)
} }
} },
}); })
// Create a Response object with the stream
return new Response(stream, { return new Response(stream, {
headers: { headers: {
'Content-Type': 'text/event-stream', "Content-Type": "text/event-stream",
'Cache-Control': 'no-cache', "Cache-Control": "no-cache",
'Connection': 'keep-alive' Connection: "keep-alive",
},
})
},
} }
});
}
};
} }
module.exports = { module.exports = {
createSTT, createSTT,
createLLM, createLLM,
createStreamingLLM createStreamingLLM,
}; }