Radxa Dragon Q6A AI Assistant: The "Hybrid" Approach

Verified: February 2026
Hardware: Radxa Dragon Q6A (Qualcomm QCS6490)
OS: Ubuntu 24.04 (Noble)

The Goal

To create a "Jarvis-like" voice assistant that runs entirely on-device (offline) with low latency.

The Challenge & Lessons Learned

Getting AI running on embedded NPUs is often a trade-off between "possible" and "practical."

  1. Llama 3.2 (The Brain): Runs beautifully on the NPU using Qualcomm's genie runtime. It's fast (~15-20 t/s) and efficient.
  2. Whisper (The Ears): While running Whisper on the NPU is possible, the available pre-compiled models often lack the "Decoder Loop" logic required to turn math into text.
    • Lesson: Instead of fighting complex C++ graphs for NPU Whisper, we use Whisper-Tiny on the CPU (see the sketch after this list). It is lightweight, reliable, and more than fast enough for real-time speech.
  3. Dependencies: Ubuntu 24.04 is strict about Python packages. Always use a virtual environment (venv).
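
For context, the CPU half of the hybrid is only a few lines: Whisper runs through the standard transformers pipeline, pinned to the CPU so the NPU stays free for the LLM. A minimal sketch (the audio file name is just a placeholder):

from transformers import pipeline

# Load Whisper-Tiny on the CPU only; the NPU is reserved for Llama.
transcriber = pipeline("automatic-speech-recognition",
                       model="openai/whisper-tiny",
                       device="cpu")

# Transcribe a short WAV recording (placeholder file name).
print(transcriber("sample.wav")["text"])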

Prerequisites

  • Radxa Dragon Q6A board
  • Microphone (USB or onboard)
  • Internet connection (for initial setup)

Step 1: System Dependencies

Install the audio and system libraries required for recording and processing.

sudo apt update
sudo apt install -y fastrpc fastrpc-dev libcdsprpc1 radxa-firmware-qcs6490 \
    python3-pip python3-venv libportaudio2 ffmpeg git
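
Optional: before continuing, you can confirm the audio libraries are visible to Python. This is just a sanity check, not part of the setup itself:

# Sanity check (any system python3): confirms PortAudio and ffmpeg are installed.
import shutil
from ctypes.util import find_library

print("portaudio:", find_library("portaudio") or "NOT FOUND")
print("ffmpeg:   ", shutil.which("ffmpeg") or "NOT FOUND")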

Step 2: NPU Permissions

Your user needs permission to access the NPU (Hexagon DSP) device nodes. Create a udev rule so you don't need sudo later.

  1. Create /etc/udev/rules.d/99-fastrpc.rules with the following contents:
KERNEL=="fastrpc-*", MODE="0666"
SUBSYSTEM=="dma_heap", KERNEL=="system", MODE="0666"

  2. Apply the changes:
sudo udevadm control --reload-rules
sudo udevadm trigger
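
To verify the rule took effect, check the permissions on the device nodes. The exact fastrpc node names vary by firmware, so treat this as a rough check:

# Rough permission check; node names are an assumption based on typical fastrpc setups.
import glob, os, stat

for node in glob.glob("/dev/fastrpc-*") + glob.glob("/dev/dma_heap/system"):
    mode = stat.S_IMODE(os.stat(node).st_mode)
    status = "ok" if mode == 0o666 else "check the udev rule"
    print(f"{node}: {oct(mode)} ({status})")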

Step 3: Python Environment

Set up a clean environment. Crucial: We pin transformers to version 4.48.1 to avoid audio pipeline bugs in newer versions.

# Create and activate venv
python3 -m venv ~/ai-assistant-venv
source ~/ai-assistant-venv/bin/activate

# Install core libraries
pip install --upgrade pip
pip install modelscope sounddevice soundfile numpy torch

# Install specific transformers version for Whisper stability
pip install "transformers==4.48.1"
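
A quick way to confirm the environment is healthy is to print the pinned transformers version and list the audio devices sounddevice can see (your microphone should appear in the list):

# Run inside the activated venv.
import transformers
import sounddevice as sd

print("transformers:", transformers.__version__)  # expect 4.48.1
print(sd.query_devices())                         # your microphone should be listed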

Step 4: Setup Llama 3.2 (NPU)

We use modelscope to fetch the Radxa-packaged Llama model, which includes the genie binary.

mkdir -p ~/llama-npu && cd ~/llama-npu
pip install modelscope  # Already installed in Step 3; safe to skip if your venv is active
modelscope download --model radxa/Llama3.2-1B-4096-qairt-v68 --local_dir .

# Make the NPU runner executable
chmod +x genie-t2t-run
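
You can sanity-check the download by making sure the files that llama_engine.py expects are in place (the config filename matches the one referenced in the listing below):

# Verify the NPU runner and its config landed in ~/llama-npu.
import os

model_dir = os.path.expanduser("~/llama-npu")
for name in ("genie-t2t-run", "htp-model-config-llama32-1b-gqa.json"):
    path = os.path.join(model_dir, name)
    print(f"{name}: {'found' if os.path.exists(path) else 'MISSING'}")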

Step 5: Install the Scripts

  1. Create llama_engine.py (the NPU wrapper; full listing below).
  2. Create npu_voice_assistant.py (the main application; full listing below).
  3. Place both in the same folder (e.g., ~/ai-assistant/).

How to Run

Make sure your microphone is plugged in and its input gain is turned up.

source ~/ai-assistant-venv/bin/activate
cd ~/ai-assistant
python3 npu_voice_assistant.py

Credits: Developed through trial and error on the Radxa forums and community debugging sessions.

# llama_engine.py
import subprocess
import os
import re


class LlamaNPU:
    def __init__(self, model_dir="~/llama-npu"):
        """
        Initialize the NPU wrapper.
        model_dir: Path where 'genie-t2t-run' and the .json config are located.
        """
        self.model_dir = os.path.expanduser(model_dir)
        self.cmd_path = os.path.join(self.model_dir, "genie-t2t-run")
        self.config = os.path.join(self.model_dir, "htp-model-config-llama32-1b-gqa.json")

        # Validation
        if not os.path.exists(self.cmd_path):
            raise FileNotFoundError(f"Binary not found: {self.cmd_path}")
        if not os.path.exists(self.config):
            raise FileNotFoundError(f"Config not found: {self.config}")

    def generate(self, user_prompt):
        """
        Runs the NPU model with a constructed prompt.
        Returns the clean text response.
        """
        # Prompt engineering for Llama 3:
        # enforce a "concise" system prompt to keep the 1B model focused.
        full_prompt = (
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            f"You are a helpful voice assistant. Keep answers concise (under 2 sentences).<|eot_id|>"
            f"<|start_header_id|>user<|end_header_id|>\n\n{user_prompt}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        # Environment setup.
        # Crucial: add model_dir to LD_LIBRARY_PATH so the binary finds its .so files.
        env = os.environ.copy()
        current_ld = env.get("LD_LIBRARY_PATH", "")
        env["LD_LIBRARY_PATH"] = f"{self.model_dir}:{current_ld}"

        cmd = [self.cmd_path, "-c", self.config, "-p", full_prompt]

        try:
            # Execute the binary
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                env=env,
                encoding='utf-8',
                errors='replace'
            )

            # Post-processing: extract text between [BEGIN] and [END] tags
            raw_output = result.stdout
            match = re.search(r'\[BEGIN\]:\s*(.*?)(?:\[END\]|$)', raw_output, re.DOTALL)
            if match:
                return match.group(1).strip()

            # Fallback if tags are missing
            return raw_output.strip()
        except Exception as e:
            return f"Error running NPU model: {str(e)}"


if __name__ == "__main__":
    # Quick test
    llm = LlamaNPU()
    print("Test Answer:", llm.generate("What is the capital of France?"))
#!/usr/bin/env python3
# npu_voice_assistant.py
import os
import sys
import time
import warnings

import sounddevice as sd
import soundfile as sf
import numpy as np
from transformers import pipeline

# Import our custom NPU wrapper.
# Ensure this path matches where you saved llama_engine.py.
sys.path.append(os.getcwd())
try:
    from llama_engine import LlamaNPU
except ImportError:
    print("❌ Error: Could not find 'llama_engine.py'. Please create it first.")
    sys.exit(1)

# --- Configuration ---
SAMPLE_RATE = 16000
DURATION = 5  # Seconds to record per turn
LLAMA_PATH = "~/llama-npu"
WHISPER_MODEL = "openai/whisper-tiny"  # 'tiny' is fast on CPU, 'small' is more accurate

# Suppress HuggingFace warnings
warnings.filterwarnings("ignore", category=FutureWarning)


def main():
    print("\n=== Radxa NPU Hybrid Assistant ===")
    print("Initializing engines...")

    # 1. Load the brain (NPU)
    print("  🧠 Llama 3.2 (NPU)...", end=" ", flush=True)
    try:
        llm = LlamaNPU(LLAMA_PATH)
        print("✓ Ready")
    except Exception as e:
        print(f"\n❌ Llama failed: {e}")
        return

    # 2. Load the ears (CPU)
    print(f"  👂 Whisper ({WHISPER_MODEL} CPU)...", end=" ", flush=True)
    try:
        # We force CPU usage for reliability
        transcriber = pipeline("automatic-speech-recognition", model=WHISPER_MODEL, device="cpu")
        print("✓ Ready")
    except Exception as e:
        print(f"\n❌ Whisper failed: {e}")
        return

    print("\n" + "=" * 40)
    print(" Ready! Press Enter to speak.")
    print(" (Ctrl+C to quit)")
    print("=" * 40)

    while True:
        try:
            input("\n[Press Enter] > ")

            # --- A. Record ---
            print("🔴 Recording...", end=" ", flush=True)
            recording = sd.rec(int(DURATION * SAMPLE_RATE),
                               samplerate=SAMPLE_RATE, channels=1)
            sd.wait()

            # Save a temp file for Whisper
            audio_file = "temp_voice.wav"
            sf.write(audio_file, recording, SAMPLE_RATE)
            print("Processing...", end=" ", flush=True)

            # --- B. Transcribe (CPU) ---
            # Force English to suppress the language-detection warning
            result = transcriber(audio_file, generate_kwargs={"language": "en"})
            user_text = result["text"].strip()
            print(f"\r🗣️ You: {user_text}")

            if len(user_text) < 2:
                print("   (No clear speech detected)")
                continue

            # --- C. Think (NPU) ---
            print("🤖 AI thinking...", end=" ", flush=True)
            start_time = time.time()
            response = llm.generate(user_text)
            elapsed = time.time() - start_time

            print(f"\r🤖 AI: {response}")
            print(f"   (NPU time: {elapsed:.2f}s)")

        except KeyboardInterrupt:
            print("\n\nExiting... Goodbye!")
            break
        except Exception as e:
            print(f"\nError: {e}")


if __name__ == "__main__":
    main()