Audio Recording Action Button 🎤

After many attempts, I managed to add an audio action button that transcribes voice to text. I'm sharing it here.

It uses Google Chrome’s free speech recognition API to transcribe your voice, shows the live transcription in the text box as you speak, and finally sends the transcription automatically by simulating an Enter key press. Here is the embed code; inject it into the footer section:

<script>
(function() {
  // CSS class that marks buttons created by this script; injectMic() looks for
  // it inside each container so it never adds a second button to the same one.
  const MIC_CLASS = 'mic-button';

  /**
   * Adds a microphone (voice dictation) button to every chat toolbar that does
   * not have one yet.
   *
   * Toolbars are the elements matching `div.flex.items-center.gap-x-2`. The
   * function is idempotent: a container that already holds a
   * `button.<MIC_CLASS>` is skipped, so it is safe to call it repeatedly
   * (for example from a MutationObserver callback).
   */
  function injectMic() {
    document.querySelectorAll('div.flex.items-center.gap-x-2').forEach((container) => {
      // Already decorated: do not add a duplicate button.
      if (container.querySelector(`button.${MIC_CLASS}`)) return;
      container.appendChild(createMicButton());
    });
  }

  /**
   * Builds one microphone button with its click behaviour wired up.
   *
   * First click starts a dictation session; clicking again while the session
   * is running stops it instead of starting a second, overlapping one.
   *
   * @returns {HTMLButtonElement} The button, not yet attached to the DOM.
   */
  function createMicButton() {
    const btn = document.createElement('button');
    // A <button> defaults to type="submit"; be explicit so the click can never
    // submit an enclosing form.
    btn.type = 'button';
    btn.className = `${MIC_CLASS} outline-none w-8 h-8 flex items-center justify-center rounded-full duration-200 transition-colors ease-in-out`;
    // The button only contains an icon, so give assistive tech a name for it.
    btn.setAttribute('aria-label', 'Dictar mensaje por voz');
    btn.style.backgroundColor = 'rgba(0, 0, 0, 0.063)';
    btn.style.marginLeft = '4px';
    btn.innerHTML = `
        <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24"
             class="h-6 w-6 shrink-0 duration-200 transition-colors ease-in-out"
             style="color: rgba(0, 0, 0, 0.5);">
          <path fill="currentColor"
                d="M12 14a3 3 0 0 0 3-3V6a3 3 0 0 0-6 0v5a3 3 0 0 0 3 3Zm5-3a5 5 0 0 1-10 0H5a7 7 0 0 0 14 0h-2ZM11 21h2v-2h-2v2Z"/>
        </svg>
      `;

    // Recognition session started by this button, or null when idle.
    let activeRecognition = null;

    btn.addEventListener('click', () => {
      if (activeRecognition) {
        // Second click while listening: finish the current session.
        activeRecognition.stop();
        return;
      }

      const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
      if (!Recognition) {
        alert('Tu navegador no soporta SpeechRecognition');
        return;
      }

      const textarea = document.querySelector('textarea.resize-none');
      if (!textarea) {
        // Distinct message: the browser is fine, the chat input is missing.
        alert('No se encontró el cuadro de texto del chat');
        return;
      }

      activeRecognition = startDictation(Recognition, textarea, () => {
        activeRecognition = null;
      });
    });

    return btn;
  }

  /**
   * Starts one speech-recognition session that mirrors the live transcript
   * into `textarea` and, when the session ends cleanly with some text,
   * simulates an Enter key press on it to send the message.
   *
   * @param {Function} Recognition - SpeechRecognition constructor to use.
   * @param {HTMLTextAreaElement} textarea - Chat input that receives the text.
   * @param {Function} onSessionEnd - Called once when the session has ended.
   * @returns {object} The started recognition instance (call `.stop()` to end).
   */
  function startDictation(Recognition, textarea, onSessionEnd) {
    // Looked up once per session instead of on every result event. Calling the
    // prototype setter (rather than assigning `textarea.value`) bypasses any
    // setter defined on the element instance itself.
    // NOTE(review): presumably needed because the host UI framework tracks the
    // value on the instance (React-style controlled input) — confirm.
    const setNativeValue = Object.getOwnPropertyDescriptor(
      HTMLTextAreaElement.prototype,
      'value'
    ).set;

    let finalTranscript = '';
    let latestText = '';
    let failed = false;

    const rec = new Recognition();
    rec.lang = 'es-ES';
    rec.interimResults = true;
    rec.maxAlternatives = 1;

    rec.onresult = (e) => {
      let interim = '';
      for (let i = e.resultIndex; i < e.results.length; i++) {
        const result = e.results[i];
        if (result.isFinal) finalTranscript += `${result[0].transcript} `;
        else interim += result[0].transcript;
      }
      latestText = (finalTranscript + interim).trimEnd();
      // Update the textarea live and notify listeners of the change.
      setNativeValue.call(textarea, latestText);
      textarea.dispatchEvent(new Event('input', { bubbles: true }));
    };

    rec.onerror = (err) => {
      failed = true;
      console.error('SpeechRecognition Error:', err.error);
    };

    rec.onend = () => {
      onSessionEnd();
      // Do not auto-send after an error (e.g. microphone permission denied) or
      // when nothing was transcribed; otherwise Enter would submit whatever
      // happened to be in the textarea, or an empty message.
      if (failed || latestText === '') return;
      // Simulate Enter to send the message automatically.
      textarea.dispatchEvent(new KeyboardEvent('keydown', {
        bubbles: true, cancelable: true,
        key: 'Enter', code: 'Enter', which: 13, keyCode: 13
      }));
    };

    // Handlers are attached before start() so no event can be missed.
    rec.start();
    return rec;
  }

  // First pass: add the button to any toolbar that is already rendered.
  injectMic();

  // Re-run the injection on every DOM change so a re-rendered toolbar gets its
  // button back. Watch the chat's main container when it exists at load time;
  // otherwise fall back to the whole document body.
  const chatContainer = document.querySelector('div.fixed.flex.w-full.flex-col');
  const observedNode = chatContainer === null ? document.body : chatContainer;
  const micObserver = new MutationObserver(injectMic);
  micObserver.observe(observedNode, { subtree: true, childList: true });
})();
</script>

Try it and tell me how it goes. @nathaniel @admin_mike, is there any possibility of adding it natively?

~ all glory belongs to God

6 Likes

Wow, this is really cool – I’m assuming this would only work if the end user were using Chrome as their browser?

1 Like

Hello, very good observation. In that case, you can swap the Chrome API for Whisper or Deepgram; with that change it will work in any browser.

@bruno12345 this is pretty cool. Thank you so much for sharing this.