From 7cbf2d04a9dfe1c7a6fbcab4394ab22734816ea8 Mon Sep 17 00:00:00 2001 From: syntaxbullet Date: Tue, 17 Feb 2026 15:08:53 +0100 Subject: [PATCH] Initial commit: Calliope voice-to-text macOS menu bar app Co-Authored-By: Claude Opus 4.6 --- .gitignore | 22 +++ CLAUDE.md | 42 ++++++ LICENSE | 21 +++ README.md | 86 +++++++++++ calliope/__init__.py | 0 calliope/app.py | 302 +++++++++++++++++++++++++++++++++++++ calliope/cli.py | 55 +++++++ calliope/config.py | 85 +++++++++++ calliope/hotkeys.py | 106 +++++++++++++ calliope/overlay.py | 313 +++++++++++++++++++++++++++++++++++++++ calliope/recorder.py | 88 +++++++++++ calliope/setup_wizard.py | 147 ++++++++++++++++++ calliope/transcriber.py | 73 +++++++++ calliope/typer.py | 64 ++++++++ pyproject.toml | 27 ++++ 15 files changed, 1431 insertions(+) create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 calliope/__init__.py create mode 100644 calliope/app.py create mode 100644 calliope/cli.py create mode 100644 calliope/config.py create mode 100644 calliope/hotkeys.py create mode 100644 calliope/overlay.py create mode 100644 calliope/recorder.py create mode 100644 calliope/setup_wizard.py create mode 100644 calliope/transcriber.py create mode 100644 calliope/typer.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18f89a3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +*.egg +dist/ +build/ +.eggs/ +*.whl +.venv/ +venv/ +env/ +.env +*.log +.DS_Store +*.swp +*.swo +*~ +.idea/ +.vscode/ +*.iml diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..a48fe58 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,42 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What is Calliope? + +A macOS menu bar app for local voice-to-text. 
Users press a hotkey, speak, and transcribed text is typed into the focused app. Runs entirely offline using Whisper models via Hugging Face Transformers + PyTorch. + +## Setup & Running + +```bash +pip install -e . # Install in dev mode +calliope # Launch (runs setup wizard on first run) +calliope setup # Re-run setup wizard +calliope --debug # Launch with debug logging +calliope --device 2 --model openai/whisper-large-v3 # Override config +``` + +No test suite or linter is configured yet. + +## Architecture + +**Entry point:** `calliope/cli.py` → Click CLI → `calliope/app.py:main()` + +**Data flow:** Hotkey press → Record audio → Transcribe with Whisper → Type into focused app + +Key modules in `calliope/`: + +- **app.py** — `CalliopeApp(rumps.App)`: main orchestrator, manages menu bar UI and coordinates all components +- **recorder.py** — Audio capture via `sounddevice` at 16kHz mono float32, with chunk consolidation +- **transcriber.py** — Whisper STT using HF `transformers.pipeline("automatic-speech-recognition")` +- **hotkeys.py** — `HotkeyListener` using `pynput`: supports push-to-talk (Ctrl+Shift hold) and toggle (Ctrl+Space) modes +- **typer.py** — Outputs text via Quartz CGEvents (character mode) or clipboard paste (Cmd+V) +- **overlay.py** — `WaveformOverlay`: floating NSPanel with scrolling waveform during recording, pulsing dots during transcription +- **setup_wizard.py** — Rich-based interactive first-run config (mic, hotkeys, model download) +- **config.py** — Loads/saves YAML config at `~/.config/calliope/config.yaml` + +## Platform Constraints + +- **macOS only** — uses `pyobjc` bindings (Quartz, AppKit, AVFoundation, ApplicationServices) +- **MPS (Apple Silicon):** must use float32, not float16 (causes garbled Whisper output) +- Requires Accessibility and Microphone permissions in macOS System Settings diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..5eeb562 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright 
(c) 2026 Calliope Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ecd93ed --- /dev/null +++ b/README.md @@ -0,0 +1,86 @@ +# Calliope + +Voice-to-text for macOS — speak and type into any app. + +Calliope sits in your menu bar, listens when you hold a hotkey, transcribes your speech with Whisper, and types the result into whatever app is focused. No cloud, no API keys — everything runs locally on your Mac. + +## Installation + +```bash +git clone https://github.com/yourname/calliope.git +cd calliope +pip install -e . 
+``` + +## Usage + +```bash +# First run — launches the setup wizard, then starts the app +calliope + +# Re-run the setup wizard +calliope setup + +# Launch with overrides +calliope --device 2 --model openai/whisper-large-v3 --debug + +# Print version +calliope --version +``` + +## Hotkeys + +| Action | Default | Description | +|--------|---------|-------------| +| Push-to-talk | `Ctrl+Shift` (hold) | Records while held, transcribes on release | +| Toggle | `Ctrl+Space` | Start/stop recording | + +Hotkeys are configurable via the setup wizard or `~/.config/calliope/config.yaml`. + +## Permissions + +Calliope needs two macOS permissions: + +- **Accessibility** — to type text into other apps (System Settings > Privacy & Security > Accessibility) +- **Microphone** — to record audio (System Settings > Privacy & Security > Microphone) + +The setup wizard checks for these and can open System Settings for you. + +## Configuration + +Config lives at `~/.config/calliope/config.yaml`: + +```yaml +device: null # sounddevice index; null = system default +model: distil-whisper/distil-large-v3 +hotkeys: + ptt: ctrl+shift + toggle: ctrl+space +context: "" # domain-specific terms to help Whisper +debug: false +``` + +CLI flags override config values for that session. + +## Troubleshooting + +**"Status: Model load failed"** +Check that you have enough disk space and RAM. The default model needs ~1.5 GB. Run with `--debug` for detailed logs. + +**No text appears after transcribing** +Make sure Accessibility permission is granted. Restart Calliope after granting it. + +**Wrong microphone** +Run `calliope setup` to pick a different input device, or set `device` in the config file. Use `python -m sounddevice` to list devices. + +**Hotkeys not working** +Ensure no other app is capturing the same key combo. Customize hotkeys via `calliope setup`. 
+ +## Remaining TODOs + +- LICENSE file +- Unit tests +- CI/CD pipeline +- Homebrew formula +- `.app` bundle for drag-and-drop install +- Changelog diff --git a/calliope/__init__.py b/calliope/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/calliope/app.py b/calliope/app.py new file mode 100644 index 0000000..9c1706e --- /dev/null +++ b/calliope/app.py @@ -0,0 +1,302 @@ +"""Calliope — Voice-to-text macOS menu bar app.""" + +import logging +import os +import threading +import time +from typing import Any + +# Disable tokenizers parallelism to avoid leaked semaphore warnings on shutdown. +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +# Run offline — models are downloaded during setup, no need to hit HuggingFace on every launch. +os.environ.setdefault("HF_HUB_OFFLINE", "1") + +import rumps + +from calliope import config as config_mod +from calliope.recorder import Recorder +from calliope.transcriber import Transcriber +from calliope.typer import type_text, type_text_clipboard +from calliope.hotkeys import HotkeyListener +from calliope.overlay import WaveformOverlay + +log = logging.getLogger(__name__) + + + +class CalliopeApp(rumps.App): + def __init__(self, cfg: dict[str, Any] | None = None): + super().__init__("Calliope", title="\U0001f3a4", quit_button=None) # 🎤 + + if cfg is None: + cfg = config_mod.load() + + self.cfg = cfg + self.overlay = WaveformOverlay() + self.recorder = Recorder(device=cfg.get("device")) + self.transcriber = Transcriber( + model=cfg.get("model", "distil-whisper/distil-large-v3"), + ) + self.transcriber.context = cfg.get("context", "") + self.transcriber.language = cfg.get("language", "auto") + + self._recording = False + self._rec_lock = threading.Lock() + self._rec_start_time: float | None = None + self._rec_timer: rumps.Timer | None = None + + self.status_item = rumps.MenuItem("Status: Loading model...") + self.status_item.set_callback(None) + self.toggle_item = rumps.MenuItem("Start Recording", 
callback=self._on_toggle_click) + self.context_item = rumps.MenuItem("Set Whisper Context...", callback=self._on_set_context) + + # Language submenu + self._lang_menu = rumps.MenuItem("Language") + current_lang = cfg.get("language", "auto") + for display_name, code in config_mod.LANGUAGES.items(): + prefix = "\u2713 " if code == current_lang else " " + item = rumps.MenuItem(f"{prefix}{display_name}", callback=self._on_language_select) + self._lang_menu.add(item) + + # Model submenu + self._model_menu = rumps.MenuItem("Model") + current_model = cfg.get("model", "distil-whisper/distil-large-v3") + for model_id in config_mod.MODELS: + short = model_id.split("/")[-1] + prefix = "\u2713 " if model_id == current_model else " " + item = rumps.MenuItem(f"{prefix}{short}", callback=self._on_model_select) + self._model_menu.add(item) + + quit_item = rumps.MenuItem("Quit Calliope", callback=self._on_quit) + + self.menu = [ + self.status_item, + None, + self.toggle_item, + self.context_item, + self._lang_menu, + self._model_menu, + None, + quit_item, + ] + + hotkey_cfg = cfg.get("hotkeys", {}) + self.hotkeys = HotkeyListener( + on_push_to_talk_start=self._start_recording, + on_push_to_talk_stop=self._stop_and_transcribe, + on_toggle=self._toggle_recording, + ptt_combo=hotkey_cfg.get("ptt", "ctrl+shift"), + toggle_combo=hotkey_cfg.get("toggle", "ctrl+space"), + ) + + # Load model in background + threading.Thread(target=self._load_model, daemon=True).start() + + def _load_model(self) -> None: + try: + self.transcriber.load() + self.status_item.title = "Status: Ready" + self.hotkeys.start() + log.info("Model loaded, hotkeys active") + except Exception: + log.error("Failed to load model", exc_info=True) + self.status_item.title = "Status: Model load failed" + try: + rumps.notification("Calliope", "Error", "Failed to load Whisper model. 
Check logs.") + except RuntimeError: + pass + + @staticmethod + def _activate_app(): + """Temporarily become a regular app so dialog text fields receive focus.""" + from AppKit import NSApplication, NSApplicationActivationPolicyRegular + app = NSApplication.sharedApplication() + app.setActivationPolicy_(NSApplicationActivationPolicyRegular) + app.activateIgnoringOtherApps_(True) + + @staticmethod + def _deactivate_app(): + """Revert to accessory app (no Dock icon).""" + from AppKit import NSApplication, NSApplicationActivationPolicyAccessory + NSApplication.sharedApplication().setActivationPolicy_(NSApplicationActivationPolicyAccessory) + + def _on_set_context(self, sender) -> None: + self._activate_app() + response = rumps.Window( + message="Provide context to help Whisper with domain-specific terms, " + "names, or jargon. For example:\n\n" + "\"Meeting about Kubernetes, gRPC, and the Istio service mesh.\"", + title="Set Whisper Context", + default_text=self.transcriber.context, + ok="Save", + cancel="Clear", + dimensions=(320, 120), + ).run() + if response.clicked == 1: # Save + self.transcriber.context = response.text.strip() + else: # Clear + self.transcriber.context = "" + self._deactivate_app() + ctx = self.transcriber.context + self.context_item.title = f"Set Whisper Context... ({ctx[:20]}...)" if ctx else "Set Whisper Context..." 
+ + def _on_language_select(self, sender) -> None: + display_name = sender.title.strip().lstrip("\u2713").strip() + code = config_mod.LANGUAGES.get(display_name, "auto") + self.transcriber.language = code + # Update checkmarks + for item in self._lang_menu.values(): + name = item.title.strip().lstrip("\u2713").strip() + item.title = f"\u2713 {name}" if config_mod.LANGUAGES.get(name) == code else f" {name}" + self.cfg["language"] = code + config_mod.save(self.cfg) + log.info("Language set to %s (%s)", display_name, code) + + def _on_model_select(self, sender) -> None: + short_name = sender.title.strip().lstrip("\u2713").strip() + # Find full model ID + model_id = None + for m in config_mod.MODELS: + if m.split("/")[-1] == short_name: + model_id = m + break + if model_id is None or model_id == self.transcriber.model: + return + # Update checkmarks + for item in self._model_menu.values(): + name = item.title.strip().lstrip("\u2713").strip() + item.title = f"\u2713 {name}" if name == short_name else f" {name}" + self.cfg["model"] = model_id + config_mod.save(self.cfg) + self.status_item.title = "Status: Loading model..." 
+ self.hotkeys.stop() + self._release_transcriber() + self.transcriber = Transcriber(model=model_id) + self.transcriber.context = self.cfg.get("context", "") + self.transcriber.language = self.cfg.get("language", "auto") + threading.Thread(target=self._load_model, daemon=True).start() + log.info("Switching model to %s", model_id) + + def _release_transcriber(self) -> None: + """Free the current Whisper model to reclaim GPU memory.""" + if self.transcriber is not None: + self.transcriber._pipe = None + self.transcriber._tokenizer = None + import torch + if torch.backends.mps.is_available(): + torch.mps.empty_cache() + + def _on_toggle_click(self, sender) -> None: + self._toggle_recording() + + def _toggle_recording(self) -> None: + if self._recording: + self._stop_and_transcribe() + else: + self._start_recording() + + def _start_recording(self) -> None: + with self._rec_lock: + if self._recording: + return + self._recording = True + self._rec_start_time = time.time() + self.title = "\U0001f534 0:00" # 🔴 + self.toggle_item.title = "Stop Recording" + self.status_item.title = "Status: Recording..." 
+ self.recorder.on_audio = self.overlay.push_samples + try: + self.recorder.start() + except Exception: + log.error("Failed to start recording", exc_info=True) + with self._rec_lock: + self._recording = False + self.title = "\U0001f3a4" # 🎤 + self.toggle_item.title = "Start Recording" + self.status_item.title = "Status: Mic error (check device)" + try: + rumps.notification("Calliope", "", "Microphone unavailable — check audio device") + except RuntimeError: + pass + return + self.overlay.show() + self._rec_timer = rumps.Timer(self._update_rec_duration, 1) + self._rec_timer.start() + try: + rumps.notification("Calliope", "", "Recording started") + except RuntimeError: + pass # Info.plist missing CFBundleIdentifier + log.info("Recording started") + + def _stop_and_transcribe(self) -> None: + with self._rec_lock: + if not self._recording: + return + self._recording = False + if self._rec_timer: + self._rec_timer.stop() + self._rec_timer = None + duration = int(time.time() - self._rec_start_time) if self._rec_start_time else 0 + self._rec_start_time = None + self.title = "\U0001f3a4" # 🎤 + self.toggle_item.title = "Start Recording" + self.status_item.title = "Status: Transcribing..." 
+ self.overlay.show_transcribing() + + audio = self.recorder.stop() + try: + rumps.notification("Calliope", "", f"Recording stopped ({duration}s)") + except RuntimeError: + pass + log.info("Recording stopped, %d samples", audio.size) + threading.Thread(target=self._transcribe_and_type, args=(audio,), daemon=True).start() + + def _update_rec_duration(self, timer) -> None: + if self._rec_start_time is None: + return + elapsed = int(time.time() - self._rec_start_time) + minutes, seconds = divmod(elapsed, 60) + self.title = f"\U0001f534 {minutes}:{seconds:02d}" + + def _transcribe_and_type(self, audio) -> None: + try: + text = self.transcriber.transcribe(audio) + if text: + def _do_type(): + try: + if self.cfg.get("typing_mode", "char") == "clipboard": + type_text_clipboard(text) + else: + type_text(text) + print(f"\n[Calliope] {text}") + log.info("Typed %d chars", len(text)) + except Exception: + log.error("Typing failed", exc_info=True) + from PyObjCTools.AppHelper import callAfter + callAfter(_do_type) + self.overlay.hide() + self.status_item.title = "Status: Ready" + except Exception: + log.error("Transcription failed", exc_info=True) + self.overlay.hide() + self.status_item.title = "Status: Ready" + try: + rumps.notification("Calliope", "Error", "Transcription failed. Check logs.") + except RuntimeError: + pass + + def _on_quit(self, sender) -> None: + self.hotkeys.stop() + self.recorder.stop() + # Stop overlay timers synchronously to avoid retain cycles on quit. 
+ self.overlay.cleanup() + rumps.quit_application() + + +def main(): + from calliope.cli import cli + cli() + + +if __name__ == "__main__": + main() diff --git a/calliope/cli.py b/calliope/cli.py new file mode 100644 index 0000000..8732aa5 --- /dev/null +++ b/calliope/cli.py @@ -0,0 +1,55 @@ +"""CLI entry point using click.""" + +import logging + +import click + +from calliope import config + + +@click.group(invoke_without_command=True) +@click.option("--device", type=int, default=None, help="Audio input device index.") +@click.option("--model", type=str, default=None, help="Whisper model name.") +@click.option("--context", type=str, default=None, help="Transcription context prompt.") +@click.option("--debug", is_flag=True, default=False, help="Enable debug logging.") +@click.version_option(package_name="calliope") +@click.pass_context +def cli(ctx, device, model, context, debug): + """Calliope — Voice-to-text for macOS.""" + level = logging.DEBUG if debug else logging.INFO + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + ) + + cfg = config.load() + + # CLI flags override config + if device is not None: + cfg["device"] = device + if model is not None: + cfg["model"] = model + if context is not None: + cfg["context"] = context + if debug: + cfg["debug"] = True + + ctx.ensure_object(dict) + ctx.obj["cfg"] = cfg + + if ctx.invoked_subcommand is None: + # First run → wizard, then launch + if not config.exists(): + from calliope.setup_wizard import run as run_wizard + cfg = run_wizard() + ctx.obj["cfg"] = cfg + + from calliope.app import CalliopeApp + CalliopeApp(cfg).run() + + +@cli.command() +def setup(): + """Re-run the setup wizard.""" + from calliope.setup_wizard import run as run_wizard + run_wizard() diff --git a/calliope/config.py b/calliope/config.py new file mode 100644 index 0000000..60e1b97 --- /dev/null +++ b/calliope/config.py @@ -0,0 +1,85 @@ +"""Persistent YAML config at 
~/.config/calliope/config.yaml.""" + +import logging +from pathlib import Path +from typing import Any + +import yaml + +log = logging.getLogger(__name__) + +CONFIG_DIR = Path.home() / ".config" / "calliope" +CONFIG_PATH = CONFIG_DIR / "config.yaml" + +DEFAULTS: dict[str, Any] = { + "device": None, # sounddevice index; None = system default + "model": "distil-whisper/distil-large-v3", + "language": "auto", + "hotkeys": { + "ptt": "ctrl+shift", + "toggle": "ctrl+space", + }, + "context": "", + "debug": False, + "typing_mode": "char", # "char" or "clipboard" +} + +LANGUAGES: dict[str, str] = { + "Auto": "auto", + "English": "en", + "Spanish": "es", + "French": "fr", + "German": "de", + "Japanese": "ja", + "Chinese": "zh", + "Korean": "ko", + "Portuguese": "pt", + "Italian": "it", + "Dutch": "nl", + "Russian": "ru", +} + +MODELS: list[str] = [ + "distil-whisper/distil-large-v3", + "openai/whisper-large-v3", + "openai/whisper-base", + "openai/whisper-small", + "openai/whisper-medium", +] + + +def _deep_merge(base: dict, override: dict) -> dict: + """Recursively merge override into base, returning a new dict.""" + result = dict(base) + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = _deep_merge(result[key], value) + else: + result[key] = value + return result + + +def load() -> dict[str, Any]: + """Load config from disk, falling back to defaults.""" + cfg = dict(DEFAULTS) + if CONFIG_PATH.exists(): + try: + with open(CONFIG_PATH) as f: + saved = yaml.safe_load(f) or {} + cfg = _deep_merge(cfg, saved) + log.debug("Loaded config from %s", CONFIG_PATH) + except Exception: + log.warning("Failed to read config; using defaults", exc_info=True) + return cfg + + +def save(cfg: dict[str, Any]) -> None: + """Write config to disk.""" + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + with open(CONFIG_PATH, "w") as f: + yaml.safe_dump(cfg, f, default_flow_style=False) + log.info("Config saved to 
%s", CONFIG_PATH) + + +def exists() -> bool: + return CONFIG_PATH.exists() diff --git a/calliope/hotkeys.py b/calliope/hotkeys.py new file mode 100644 index 0000000..b61eed7 --- /dev/null +++ b/calliope/hotkeys.py @@ -0,0 +1,106 @@ +"""Global hotkey listener using pynput.""" + +import logging +from typing import Callable + +from pynput import keyboard + +log = logging.getLogger(__name__) + +# Maps string names to pynput keys +_KEY_MAP: dict[str, keyboard.Key] = { + "ctrl": keyboard.Key.ctrl, + "shift": keyboard.Key.shift, + "alt": keyboard.Key.alt, + "cmd": keyboard.Key.cmd, + "space": keyboard.Key.space, +} + + +def _parse_combo(combo: str) -> set[keyboard.Key]: + """Parse 'ctrl+shift' into a set of pynput keys.""" + keys: set[keyboard.Key] = set() + for part in combo.lower().split("+"): + part = part.strip() + if part in _KEY_MAP: + keys.add(_KEY_MAP[part]) + else: + log.warning("Unknown key in combo: %s", part) + return keys + + +class HotkeyListener: + def __init__( + self, + on_push_to_talk_start: Callable, + on_push_to_talk_stop: Callable, + on_toggle: Callable, + ptt_combo: str = "ctrl+shift", + toggle_combo: str = "ctrl+space", + ): + self._on_ptt_start = on_push_to_talk_start + self._on_ptt_stop = on_push_to_talk_stop + self._on_toggle = on_toggle + self._listener: keyboard.Listener | None = None + self._pressed: set = set() + self._ptt_active = False + self._toggle_active = False + + self._ptt_keys = _parse_combo(ptt_combo) + self._toggle_keys = _parse_combo(toggle_combo) + log.debug("PTT keys: %s, Toggle keys: %s", self._ptt_keys, self._toggle_keys) + + def start(self) -> None: + self._pressed.clear() + self._ptt_active = False + self._toggle_active = False + self._listener = keyboard.Listener( + on_press=self._on_press, + on_release=self._on_release, + ) + self._listener.daemon = True + self._listener.start() + + def stop(self) -> None: + if self._listener is not None: + try: + self._listener.stop() + except Exception: + pass + self._listener = None + 
self._pressed.clear() + self._ptt_active = False + self._toggle_active = False + + def _normalize(self, key) -> keyboard.Key | keyboard.KeyCode: + if hasattr(key, "value") and hasattr(key.value, "vk"): + vk = key.value.vk + if vk in (0x3B, 0x3E): + return keyboard.Key.ctrl + if vk in (0x38, 0x3C): + return keyboard.Key.shift + return key + + def _on_press(self, key) -> None: + key = self._normalize(key) + self._pressed.add(key) + + if self._ptt_keys.issubset(self._pressed) and not self._ptt_active: + self._ptt_active = True + self._on_ptt_start() + + if self._toggle_keys.issubset(self._pressed) and not self._toggle_active: + self._toggle_active = True + self._on_toggle() + + def _on_release(self, key) -> None: + key = self._normalize(key) + + if self._ptt_active and key in self._ptt_keys: + self._ptt_active = False + self._on_ptt_stop() + + if key in self._toggle_keys: + self._toggle_active = False + + self._pressed.discard(key) diff --git a/calliope/overlay.py b/calliope/overlay.py new file mode 100644 index 0000000..b54fb13 --- /dev/null +++ b/calliope/overlay.py @@ -0,0 +1,313 @@ +"""Floating waveform overlay shown during recording.""" + +import logging +import time +from collections import deque +from enum import Enum, auto + +import numpy as np + +from AppKit import ( + NSPanel, + NSView, + NSColor, + NSBezierPath, + NSTimer, + NSScreen, + NSWindowStyleMaskBorderless, + NSWindowStyleMaskNonactivatingPanel, + NSFloatingWindowLevel, + NSStatusWindowLevel, + NSBackingStoreBuffered, + NSApp, + NSFont, + NSFontAttributeName, + NSForegroundColorAttributeName, + NSMakePoint, +) +from Foundation import NSMakeRect +from objc import super as objc_super +from PyObjCTools.AppHelper import callAfter + +log = logging.getLogger(__name__) + +WIDTH = 360 +HEIGHT = 80 +NUM_BARS = 150 # number of amplitude samples visible at once +FPS = 30 + +# Fade animation +FADE_DURATION = 0.2 # seconds +FADE_STEPS = int(FADE_DURATION * FPS) + + +class OverlayMode(Enum): + RECORDING = auto() 
+ TRANSCRIBING = auto() + + +class WaveformView(NSView): + """Custom NSView that draws a scrolling waveform or transcribing indicator.""" + + amplitudes: deque + mode: OverlayMode + _pulse_start: float + _fade_step: int + _fade_direction: int + _fade_timer: object + _on_fade_complete: object + + def initWithFrame_(self, frame): + self = objc_super(WaveformView, self).initWithFrame_(frame) + if self is None: + return None + self.amplitudes = deque([0.0] * NUM_BARS, maxlen=NUM_BARS) + self.mode = OverlayMode.RECORDING + self._pulse_start = time.monotonic() + self._fade_step = 0 + self._fade_direction = 0 + self._fade_timer = None + self._on_fade_complete = None + return self + + def drawRect_(self, rect): + # Dark translucent rounded-rect background + bg = NSColor.colorWithCalibratedRed_green_blue_alpha_(0.1, 0.1, 0.1, 0.85) + bg.setFill() + path = NSBezierPath.bezierPathWithRoundedRect_xRadius_yRadius_( + self.bounds(), 12, 12 + ) + path.fill() + + # Subtle border + border = NSColor.colorWithCalibratedRed_green_blue_alpha_(1.0, 1.0, 1.0, 0.12) + border.setStroke() + border_path = NSBezierPath.bezierPathWithRoundedRect_xRadius_yRadius_( + self.bounds(), 12, 12 + ) + border_path.setLineWidth_(1.0) + border_path.stroke() + + if self.mode == OverlayMode.RECORDING: + self._draw_waveform() + elif self.mode == OverlayMode.TRANSCRIBING: + self._draw_transcribing() + + def _draw_waveform(self): + color = NSColor.colorWithCalibratedRed_green_blue_alpha_(0.4, 0.75, 0.5, 0.9) + color.setStroke() + + bounds = self.bounds() + w = bounds.size.width + h = bounds.size.height + mid_y = h / 2 + padding = 10 + draw_w = w - 2 * padding + draw_h = (h - 2 * padding) / 2 + + amps = list(self.amplitudes) + if not amps: + return + + step = draw_w / max(len(amps) - 1, 1) + + for sign in (1, -1): + line = NSBezierPath.bezierPath() + line.setLineWidth_(1.5) + for i, a in enumerate(amps): + x = padding + i * step + y_off = a * draw_h * sign + if i == 0: + line.moveToPoint_((x, mid_y + y_off)) + 
else: + line.lineToPoint_((x, mid_y + y_off)) + line.stroke() + + self._draw_label("calliope recording...") + + def _draw_transcribing(self): + bounds = self.bounds() + w = bounds.size.width + h = bounds.size.height + mid_y = h / 2 + + # Pulsing dots animation + elapsed = time.monotonic() - self._pulse_start + num_dots = 3 + dot_radius = 5.0 + dot_spacing = 20.0 + total_w = (num_dots - 1) * dot_spacing + start_x = (w - total_w) / 2 + + for i in range(num_dots): + # Staggered sine pulse for each dot + phase = elapsed * 3.0 - i * 0.6 + alpha = 0.3 + 0.7 * max(0.0, (1.0 + np.sin(phase)) / 2.0) + color = NSColor.colorWithCalibratedRed_green_blue_alpha_( + 0.4, 0.75, 0.5, alpha + ) + color.setFill() + x = start_x + i * dot_spacing + dot = NSBezierPath.bezierPathWithOvalInRect_( + NSMakeRect(x - dot_radius, mid_y - dot_radius + 6, + dot_radius * 2, dot_radius * 2) + ) + dot.fill() + + self._draw_label("transcribing...") + + def _draw_label(self, text: str): + from Foundation import NSString, NSDictionary + + bounds = self.bounds() + w = bounds.size.width + label = NSString.stringWithString_(text) + attrs = NSDictionary.dictionaryWithObjects_forKeys_( + [ + NSFont.systemFontOfSize_(11), + NSColor.colorWithCalibratedRed_green_blue_alpha_(1.0, 1.0, 1.0, 0.5), + ], + [NSFontAttributeName, NSForegroundColorAttributeName], + ) + label_size = label.sizeWithAttributes_(attrs) + label_x = (w - label_size.width) / 2 + label.drawAtPoint_withAttributes_(NSMakePoint(label_x, 4), attrs) + + def refresh_(self, timer): + self.setNeedsDisplay_(True) + + def fadeTick_(self, timer): + self._fade_step += 1 + progress = min(self._fade_step / FADE_STEPS, 1.0) + + if self._fade_direction == 1: + alpha = progress + else: + alpha = 1.0 - progress + + self.window().setAlphaValue_(alpha) + + if progress >= 1.0: + self.stopFade() + if self._fade_direction == -1 and self._on_fade_complete: + self._on_fade_complete() + + def stopFade(self): + if self._fade_timer is not None: + 
self._fade_timer.invalidate() + self._fade_timer = None + + def startFade_onComplete_(self, direction, on_complete): + self.stopFade() + self._fade_direction = direction + self._fade_step = 0 + self._on_fade_complete = on_complete + self._fade_timer = NSTimer.scheduledTimerWithTimeInterval_target_selector_userInfo_repeats_( + FADE_DURATION / FADE_STEPS, self, b"fadeTick:", None, True + ) + + +class WaveformOverlay: + """Floating translucent window showing a live scrolling waveform.""" + + def __init__(self): + self._panel: NSPanel | None = None + self._view: WaveformView | None = None + self._timer: NSTimer | None = None + + def _ensure_panel(self): + if self._panel is not None: + return + + screen = NSScreen.mainScreen() + screen_frame = screen.frame() + x = (screen_frame.size.width - WIDTH) / 2 + y = screen_frame.size.height - HEIGHT - 40 # near top, below menu bar + + rect = NSMakeRect(x, y, WIDTH, HEIGHT) + style = NSWindowStyleMaskBorderless | NSWindowStyleMaskNonactivatingPanel + panel = NSPanel.alloc().initWithContentRect_styleMask_backing_defer_( + rect, style, NSBackingStoreBuffered, False + ) + panel.setLevel_(NSStatusWindowLevel) + panel.setOpaque_(False) + panel.setBackgroundColor_(NSColor.clearColor()) + panel.setHasShadow_(True) + panel.setIgnoresMouseEvents_(True) + panel.setCollectionBehavior_(1 << 4) # NSWindowCollectionBehaviorCanJoinAllSpaces + + view = WaveformView.alloc().initWithFrame_(NSMakeRect(0, 0, WIDTH, HEIGHT)) + panel.setContentView_(view) + + self._panel = panel + self._view = view + + def show(self): + callAfter(self._show_on_main) + + def hide(self): + callAfter(self._hide_on_main) + + def show_transcribing(self): + """Switch overlay to transcribing state (pulsing dots).""" + callAfter(self._show_transcribing_on_main) + + def _show_on_main(self): + self._ensure_panel() + self._view.stopFade() + self._view.mode = OverlayMode.RECORDING + self._view.amplitudes = deque([0.0] * NUM_BARS, maxlen=NUM_BARS) + self._panel.setAlphaValue_(0.0) 
+ self._panel.orderFront_(None) + self._start_timer() + self._view.startFade_onComplete_(1, None) + log.debug("Overlay shown") + + def _show_transcribing_on_main(self): + self._ensure_panel() + self._view.stopFade() + self._view.mode = OverlayMode.TRANSCRIBING + self._view._pulse_start = time.monotonic() + # If panel is already visible, just switch mode; otherwise show it + if self._panel.alphaValue() < 0.01: + self._panel.setAlphaValue_(0.0) + self._panel.orderFront_(None) + self._view.startFade_onComplete_(1, None) + self._start_timer() + log.debug("Overlay switched to transcribing") + + def _hide_on_main(self): + if self._view is None or self._panel is None: + return + def on_fade_out(): + self._stop_timer() + self._panel.orderOut_(None) + self._view.startFade_onComplete_(-1, on_fade_out) + log.debug("Overlay hiding") + + def cleanup(self): + """Synchronously stop all timers and hide. Call before quit.""" + self._stop_timer() + if self._view is not None: + self._view.stopFade() + if self._panel is not None: + self._panel.orderOut_(None) + + def push_samples(self, chunk: np.ndarray): + """Called from audio callback with a new chunk of float32 samples.""" + rms = float(np.sqrt(np.mean(chunk ** 2))) + # Clamp to [0, 1] with some headroom + amplitude = min(rms * 5.0, 1.0) + if self._view is not None: + self._view.amplitudes.append(amplitude) + + def _start_timer(self): + self._stop_timer() + self._timer = NSTimer.scheduledTimerWithTimeInterval_target_selector_userInfo_repeats_( + 1.0 / FPS, self._view, b"refresh:", None, True + ) + + def _stop_timer(self): + if self._timer is not None: + self._timer.invalidate() + self._timer = None diff --git a/calliope/recorder.py b/calliope/recorder.py new file mode 100644 index 0000000..f11425d --- /dev/null +++ b/calliope/recorder.py @@ -0,0 +1,88 @@ +"""Audio recording using sounddevice.""" + +import logging + +import numpy as np +import sounddevice as sd +import threading + +log = logging.getLogger(__name__) + 
+_CONSOLIDATE_EVERY = 100 + + +class Recorder: + SAMPLE_RATE = 16_000 + CHANNELS = 1 + + def __init__(self, device: int | None = None, on_audio=None): + self._device = device + self._chunks: list[np.ndarray] = [] + self._stream: sd.InputStream | None = None + self._lock = threading.Lock() + self._chunk_count = 0 + self.on_audio = on_audio + + @property + def is_recording(self) -> bool: + return self._stream is not None and self._stream.active + + def start(self) -> None: + with self._lock: + self._chunks = [] + self._chunk_count = 0 + try: + self._stream = sd.InputStream( + samplerate=self.SAMPLE_RATE, + channels=self.CHANNELS, + dtype="float32", + device=self._device, + callback=self._callback, + ) + self._stream.start() + log.debug("Recording stream started (device=%s)", self._device) + except sd.PortAudioError: + log.error("Failed to open audio device %s", self._device, exc_info=True) + if self._stream is not None: + try: + self._stream.close() + except Exception: + pass + self._stream = None + raise + + def stop(self) -> np.ndarray: + # Stop stream first — guarantees no more callbacks after this returns. 
+ if self._stream is not None: + self._stream.stop() + self._stream.close() + self._stream = None + with self._lock: + if not self._chunks: + return np.zeros(0, dtype=np.float32) + audio = np.concatenate(self._chunks).flatten() + self._chunks = [] + return audio + + def get_audio_so_far(self) -> np.ndarray: + """Return a copy of all audio recorded so far without stopping the stream.""" + with self._lock: + if not self._chunks: + return np.zeros(0, dtype=np.float32) + return np.concatenate(self._chunks).flatten() + + def _callback(self, indata: np.ndarray, frames, time_info, status) -> None: + if status: + log.warning("Audio stream status: %s", status) + chunk = indata[:, 0].copy() if indata.ndim > 1 else indata.copy() + with self._lock: + self._chunks.append(chunk) + self._chunk_count += 1 + if self._chunk_count % _CONSOLIDATE_EVERY == 0: + self._chunks = [np.concatenate(self._chunks).flatten()] + + if self.on_audio is not None: + try: + self.on_audio(chunk) + except Exception: + log.error("Error in on_audio callback", exc_info=True) diff --git a/calliope/setup_wizard.py b/calliope/setup_wizard.py new file mode 100644 index 0000000..186ab6a --- /dev/null +++ b/calliope/setup_wizard.py @@ -0,0 +1,147 @@ +"""First-run setup wizard — Rich TUI.""" + +import subprocess +import sys + +import sounddevice as sd +from rich.console import Console +from rich.panel import Panel +from rich.progress import Progress +from rich.prompt import Confirm, IntPrompt, Prompt +from rich.table import Table + +from calliope import config + +console = Console() + + +def run() -> dict: + """Run the interactive setup wizard and return the final config.""" + cfg = dict(config.DEFAULTS) + + # ── Welcome ────────────────────────────────────────────────────── + console.print( + Panel.fit( + "[bold magenta]Calliope[/bold magenta]\n" + "Voice-to-text for macOS — speak and type into any app.\n\n" + "This wizard will walk you through first-time setup.", + border_style="magenta", + ) + ) + + # ── 
Permission checks ──────────────────────────────────────────── + console.print("\n[bold]Permission checks[/bold]") + _check_accessibility() + _check_microphone() + console.print() + + # ── Mic selection ──────────────────────────────────────────────── + console.print("[bold]Microphone selection[/bold]") + devices = sd.query_devices() + table = Table(show_header=True) + table.add_column("#", style="cyan", width=4) + table.add_column("Device") + table.add_column("Inputs", justify="right") + + input_indices: list[int] = [] + for i, d in enumerate(devices): + if d["max_input_channels"] > 0: + input_indices.append(i) + marker = " (default)" if i == sd.default.device[0] else "" + table.add_row(str(i), f"{d['name']}{marker}", str(d["max_input_channels"])) + + console.print(table) + default_dev = sd.default.device[0] + choice = Prompt.ask( + "Device index", + default=str(default_dev) if default_dev is not None else str(input_indices[0]), + ) + cfg["device"] = int(choice) if choice else None + + # ── Hotkey config ──────────────────────────────────────────────── + console.print("\n[bold]Hotkey configuration[/bold]") + console.print(f" Push-to-talk : [cyan]{cfg['hotkeys']['ptt']}[/cyan]") + console.print(f" Toggle : [cyan]{cfg['hotkeys']['toggle']}[/cyan]") + if Confirm.ask("Keep defaults?", default=True): + pass + else: + cfg["hotkeys"]["ptt"] = Prompt.ask("Push-to-talk combo", default=cfg["hotkeys"]["ptt"]) + cfg["hotkeys"]["toggle"] = Prompt.ask("Toggle combo", default=cfg["hotkeys"]["toggle"]) + + # ── Model download ─────────────────────────────────────────────── + console.print("\n[bold]Model download[/bold]") + console.print(f" Default model: [cyan]{cfg['model']}[/cyan]") + if not Confirm.ask("Use default model?", default=True): + cfg["model"] = Prompt.ask("Whisper model") + console.print(f"Downloading [cyan]{cfg['model']}[/cyan] (this may take a while)...") + + from calliope.transcriber import Transcriber + + transcriber = Transcriber(model=cfg["model"]) + with 
def _check_accessibility() -> None:
    """Print the Accessibility permission status; offer to open Settings if missing.

    Any failure while importing or querying ApplicationServices is treated
    the same as "not granted".
    """
    try:
        import ApplicationServices as app_services
        is_trusted = app_services.AXIsProcessTrusted()
    except Exception:
        # Best effort: if the status cannot be read, fall through to the hint.
        is_trusted = None

    if is_trusted:
        console.print(" [green]✓[/green] Accessibility access granted")
        return

    console.print(" [red]✗[/red] Accessibility access — required for typing")
    console.print(" Open: System Settings → Privacy & Security → Accessibility")
    wants_settings = Confirm.ask(" Open System Settings?", default=False)
    if not wants_settings:
        return

    pane_url = "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility"
    subprocess.run(["open", pane_url], check=False)
class Transcriber:
    """Speech-to-text wrapper around a Hugging Face Whisper ASR pipeline.

    Attributes set by callers after construction:
        context: optional prompt text passed to generation as ``prompt_ids``.
        language: language code, or ``"auto"`` to let the model decide.
    """

    SAMPLE_RATE = 16_000  # Hz the incoming audio is assumed to be sampled at
    MIN_DURATION_S = 1.0  # clips shorter than this are skipped
    MIN_RMS = 0.005  # clips quieter than this are skipped

    def __init__(self, model: str = "distil-whisper/distil-large-v3"):
        self.model = model
        self._pipe = None
        self._tokenizer = None
        self.context: str = ""
        self.language: str = "auto"

    @staticmethod
    def _pick_device_and_dtype():
        """Return the ``(device, dtype)`` pair the pipeline is loaded with."""
        device = "mps" if torch.backends.mps.is_available() else "cpu"
        # float32 for both possible devices: float16 produces garbled output
        # on MPS (Apple Silicon), and half precision is poorly supported on CPU.
        return device, torch.float32

    def load(self) -> None:
        """Load the ASR pipeline and tokenizer for ``self.model``.

        Raises:
            Exception: whatever the underlying loaders raise; it is logged
                and re-raised unchanged.
        """
        from transformers import AutoTokenizer

        device, dtype = self._pick_device_and_dtype()
        log.info("Loading model %s on %s (dtype=%s)", self.model, device, dtype)
        try:
            self._pipe = pipeline(
                "automatic-speech-recognition",
                model=self.model,
                torch_dtype=dtype,
                device=device,
            )
            self._tokenizer = AutoTokenizer.from_pretrained(self.model)
            log.info("Model loaded successfully")
        except Exception:
            log.error("Failed to load model %s", self.model, exc_info=True)
            raise

    def transcribe(self, audio: np.ndarray) -> str:
        """Transcribe ``audio`` and return the stripped text.

        Returns an empty string for empty, too-short or too-quiet audio. The
        model is loaded lazily, and only once audio passes those checks.
        """
        if audio.size == 0:
            return ""

        # Skip audio that's too short (<1s) or too quiet — Whisper hallucinates
        # punctuation like "!" on silence/noise.
        duration = audio.size / self.SAMPLE_RATE
        energy = float(np.sqrt(np.mean(audio ** 2)))
        log.debug("Audio: %.1fs, RMS energy: %.6f", duration, energy)
        if duration < self.MIN_DURATION_S or energy < self.MIN_RMS:
            log.debug("Audio too short or too quiet, skipping transcription")
            return ""

        if self._pipe is None:
            self.load()

        generate_kwargs = {}
        if self.context:
            # Whisper's generate() expects prompt_ids as a torch tensor on the
            # same device as the model, not the default NumPy array.
            prompt_ids = self._tokenizer.get_prompt_ids(
                self.context, return_tensors="pt"
            )
            generate_kwargs["prompt_ids"] = prompt_ids.to(self._pipe.device)
        if self.language != "auto":
            generate_kwargs["language"] = self.language
            generate_kwargs["task"] = "transcribe"

        result = self._pipe(
            {"raw": audio, "sampling_rate": self.SAMPLE_RATE},
            batch_size=4,
            return_timestamps=True,
            generate_kwargs=generate_kwargs,
        )
        return result["text"].strip()
def _type_char(char: str) -> None:
    """Type a single unicode character via CGEvents.

    Posts a key-down and a key-up event carrying ``char`` as their unicode
    payload, using virtual keycode 0 as the carrier for both.
    """
    # The length argument counts UTF-16 code units, not Python code points:
    # characters outside the BMP (e.g. emoji) need two units, and len(char)
    # would cut them down to a lone surrogate.
    utf16_units = len(char.encode("utf-16-le")) // 2

    event_down = Quartz.CGEventCreateKeyboardEvent(None, 0, True)
    event_up = Quartz.CGEventCreateKeyboardEvent(None, 0, False)

    Quartz.CGEventKeyboardSetUnicodeString(event_down, utf16_units, char)
    Quartz.CGEventKeyboardSetUnicodeString(event_up, utf16_units, char)

    Quartz.CGEventPost(Quartz.kCGAnnotatedSessionEventTap, event_down)
    Quartz.CGEventPost(Quartz.kCGAnnotatedSessionEventTap, event_up)
"pyobjc-framework-Cocoa>=9.0", + "pyobjc-framework-AVFoundation>=9.0", + "rich>=13.0.0", + "click>=8.1.0", + "pyyaml>=6.0", +] + +[project.scripts] +calliope = "calliope.app:main"