Skip to content

Instantly share code, notes, and snippets.

@cumulus13
Created December 22, 2025 08:55
Show Gist options
  • Select an option

  • Save cumulus13/444ab06ffc7a2d0ec83b79cf70c06c03 to your computer and use it in GitHub Desktop.

Select an option

Save cumulus13/444ab06ffc7a2d0ec83b79cf70c06c03 to your computer and use it in GitHub Desktop.
Detect human languages other than English
#!/usr/bin/env python3
# File: langdet.py
# Author: Hadi Cahyadi <cumulus13@gmail.com>
# Date: 2025-12-22
# Description: Detect human languages other than English
# License: MIT
import sys
import os
import argparse
import logging
from pathlib import Path
from typing import List, Optional, Tuple, Dict
from dataclasses import dataclass
import langid
from rich.console import Console
from rich.table import Table
from rich.progress import track
try:
from licface import CustomRichHelpFormatter
except ImportError:
CustomRichHelpFormatter = argparse.RawTextHelpFormatter
# Passed to richcolorlog's setup_logging() below.
# NOTE(review): presumably a list of logger names to silence — confirm
# against the richcolorlog documentation.
exceptions = ['langid']

# --debug is read straight from argv (before argparse runs) so that the
# environment variables are in place before the logging backend is imported.
if len(sys.argv) > 1 and '--debug' in sys.argv:
    print("🐞 Debug mode enabled")
    os.environ["DEBUG"] = "1"
    os.environ["LOGGING"] = "1"
    os.environ.pop('NO_LOGGING', None)
    os.environ['TRACEBACK'] = "1"
    LOG_LEVEL = "DEBUG"
else:
    os.environ['NO_LOGGING'] = "1"
    # NOTE(review): the level is "DEBUG" in both branches; NO_LOGGING is
    # presumably what silences output here — confirm this is intentional.
    LOG_LEVEL = "DEBUG"

try:
    from richcolorlog import setup_logging  # type: ignore
    logger = setup_logging(__name__, level=LOG_LEVEL, exceptions=exceptions)
except Exception:
    # richcolorlog is optional: fall back to the standard library when it is
    # missing or fails to configure. `Exception` (rather than a bare except)
    # lets KeyboardInterrupt/SystemExit propagate.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

# os.get_terminal_size() raises OSError when stdout is not attached to a
# terminal (pipe, redirect, CI). In that case let rich pick its own width.
try:
    _terminal_width = os.get_terminal_size().columns
except OSError:
    _terminal_width = None
console = Console(width=_terminal_width)
@dataclass
class DetectionResult:
    """One reported line together with the language detected in it."""

    # 1-based position of the line in the analysed file.
    line_num: int
    # The line's text as stored by the caller.
    line_content: str
    # Language code returned by the classifier.
    language: str
    # Score returned by the classifier for `language`.
    confidence: float

    def __str__(self) -> str:
        """Return a one-line summary with the score shown to two decimals."""
        score = f"{self.confidence:.2f}"
        return f"Line {self.line_num}: {self.language} (confidence: {score})"
class LanguageDetector:
    """
    Line-oriented detector that reports non-English text in a file.

    Attributes:
        min_confidence: Minimum normalized probability (0.0-1.0) a detection
            must reach to be reported.
        chunk_size: Minimum length (in characters) of the cleaned line; shorter
            lines are skipped as too short to classify.
    """

    def __init__(
        self,
        min_confidence: float = 0.0,
        chunk_size: int = 50,  # Reduced from 100 for better detection
        target_languages: Optional[List[str]] = None
    ):
        """
        Initialize the detector.

        Args:
            min_confidence: Minimum confidence score to report; clamped to
                the range 0.0-1.0.
            chunk_size: Minimum text length for detection; values below 5
                are raised to 5.
            target_languages: List of ISO 639-1 codes to restrict detection to.
        """
        self.min_confidence = max(0.0, min(1.0, min_confidence))
        self.chunk_size = max(5, chunk_size)  # Lowered minimum
        self._target_languages = list(target_languages) if target_languages else None
        # The langid model is loaded lazily so that constructing a detector
        # stays cheap when no classification is ever requested.
        self._identifier = None
        if self._target_languages:
            # Build eagerly so an invalid language code fails at construction.
            self._get_identifier()
        logger.info(f"Detector initialized with min_confidence={self.min_confidence}")

    def _get_identifier(self):
        """
        Return this detector's langid identifier, creating it on first use.

        The identifier is built with ``norm_probs=True`` so that ``classify``
        returns a probability in 0.0-1.0. The module-level ``langid.classify``
        returns an unnormalized log-probability, which cannot be compared with
        ``min_confidence`` or rendered as a percentage. Using a private
        identifier also keeps ``target_languages`` from changing langid's
        global state.
        """
        if self._identifier is None:
            from langid.langid import LanguageIdentifier, model

            identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
            if self._target_languages:
                identifier.set_languages(self._target_languages)
            self._identifier = identifier
        return self._identifier

    def _should_check_line(self, line: str, patterns: List[str]) -> bool:
        """
        Check if line contains patterns that warrant language detection.

        Args:
            line: Text line to check
            patterns: List of substrings to search for

        Returns:
            True if the line is not blank and contains at least one pattern.
        """
        if not line.strip():
            return False
        return any(pattern in line for pattern in patterns)

    def _clean_text(self, text: str) -> str:
        """
        Clean text for better detection accuracy.

        Args:
            text: Raw text

        Returns:
            Text with surrounding quote characters removed and comment
            markers replaced by spaces, stripped of outer whitespace.
        """
        cleaned = text.strip()
        # Remove string delimiters but keep content
        for delimiter in ('"', "'", '`'):
            if cleaned.startswith(delimiter) and cleaned.endswith(delimiter):
                cleaned = cleaned[1:-1]
        # Replace comment markers with a space so adjacent words stay separate
        for marker in ('//', '#', '/*', '*/'):
            cleaned = cleaned.replace(marker, ' ')
        return cleaned.strip()

    def detect_line(self, line: str, line_num: int = 0, debug: bool = False) -> Optional["DetectionResult"]:
        """
        Detect language in a single line.

        Args:
            line: Text line to analyze
            line_num: Line number for reporting
            debug: Emit debug log messages explaining each decision

        Returns:
            DetectionResult, or None when the line is too short, is English,
            falls below the confidence threshold, or classification failed.
        """
        cleaned = self._clean_text(line)
        if debug:
            logger.debug(f"Line {line_num}: Original='{line.strip()}'")
            logger.debug(f"Line {line_num}: Cleaned='{cleaned}' (len={len(cleaned)})")

        # Skip if text is too short for reliable detection
        if len(cleaned) < self.chunk_size:
            if debug:
                logger.debug(f"Line {line_num}: Skipped - too short ({len(cleaned)} < {self.chunk_size})")
            return None

        try:
            lang, confidence = self._get_identifier().classify(cleaned)
            confidence = float(confidence)
            if debug:
                logger.debug(f"Line {line_num}: Detected lang={lang}, confidence={confidence:.2f}")
            # Filter by confidence and exclude English
            if confidence >= self.min_confidence and lang != 'en':
                return DetectionResult(
                    line_num=line_num,
                    line_content=line.strip(),
                    language=lang,
                    confidence=confidence
                )
            if debug:
                reason = "is English" if lang == 'en' else f"low confidence ({confidence:.2f} < {self.min_confidence})"
                logger.debug(f"Line {line_num}: Filtered - {reason}")
        except Exception as e:
            # Best effort: one unclassifiable line must not abort the scan.
            logger.warning(f"Detection failed for line {line_num}: {e}")
        return None

    def detect_file(
        self,
        filepath: Path,
        patterns: Optional[List[str]] = None,
        include_langs: Optional[List[str]] = None,
        exclude_langs: Optional[List[str]] = None,
        show_progress: bool = True,
        debug: bool = False
    ) -> List["DetectionResult"]:
        """
        Detect non-English languages in a file.

        Args:
            filepath: Path to file to analyze
            patterns: Patterns to search for (e.g., ['"', '//'])
            include_langs: Only report these languages
            exclude_langs: Don't report these languages
            show_progress: Show progress bar
            debug: Enable debug mode

        Returns:
            List of DetectionResult objects, in file order.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
            Exception: Any error raised while reading or analysing the file
                is logged and re-raised.
        """
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {filepath}")

        patterns = patterns or ['"', "//", "#", "/*"]
        results = []
        try:
            # errors='replace' substitutes undecodable bytes, so reading
            # never raises UnicodeDecodeError.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
                lines = f.readlines()

            numbered = enumerate(lines, 1)
            if show_progress:
                numbered = track(numbered, total=len(lines),
                                 description="Analyzing...", console=console)

            for line_num, line in numbered:
                if not self._should_check_line(line, patterns):
                    if debug:
                        logger.debug(f"Line {line_num}: Skipped - no matching patterns")
                    continue

                result = self.detect_line(line, line_num, debug)
                if not result:
                    continue

                # Apply include/exclude filters
                if include_langs and result.language not in include_langs:
                    if debug:
                        logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' not in include list")
                    continue
                if exclude_langs and result.language in exclude_langs:
                    if debug:
                        logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' in exclude list")
                    continue
                results.append(result)
        except Exception as e:
            logger.error(f"Error processing file: {e}")
            raise
        return results

    def get_statistics(self, results: List["DetectionResult"]) -> Dict[str, int]:
        """
        Get language statistics from results.

        Args:
            results: List of detection results

        Returns:
            Dictionary mapping language codes to counts
        """
        stats: Dict[str, int] = {}
        for result in results:
            stats[result.language] = stats.get(result.language, 0) + 1
        return stats
def print_results(results: List[DetectionResult], show_stats: bool = True):
    """
    Print detection results in a formatted table.

    Args:
        results: List of DetectionResult objects
        show_stats: Whether to show the per-language count summary
    """
    # Local import: the analysed lines are arbitrary source text, so they
    # must be escaped before rich interprets "[...]" sequences as markup.
    from rich.markup import escape

    if not results:
        console.print("[yellow]No non-English text detected.[/yellow]")
        return

    # console.width works whether or not stdout is a terminal, whereas
    # os.get_terminal_size() raises OSError when output is piped.
    table = Table(title="Language Detection Results", width=console.width)
    table.add_column("Line", style="cyan", justify="right")
    table.add_column("Language", style="yellow")
    table.add_column("Confidence", style="magenta")
    table.add_column("Content", style="white", max_width=60)

    for result in results:
        content = result.line_content
        if len(content) > 60:
            content = content[:60] + "..."
        table.add_row(
            str(result.line_num),
            result.language,
            f"{result.confidence:.2%}",
            escape(content)
        )
    console.print(table)

    if show_stats:
        # Count occurrences per language directly; no detector instance is
        # needed just to tally the results.
        stats: Dict[str, int] = {}
        for result in results:
            stats[result.language] = stats.get(result.language, 0) + 1

        console.print("\n[bold]Language Statistics:[/bold]")
        for lang, count in sorted(stats.items(), key=lambda item: item[1], reverse=True):
            console.print(f" {lang}: {count} occurrence(s)")
def main():
    """
    Main entry point for CLI.

    Exit codes:
        0: no non-English text was detected
        1: non-English text was detected, or no file path was given
        2: the file does not exist
        3: an unexpected error occurred
    """
    # Local import: exception text may contain "[...]" (e.g. in paths), which
    # rich would otherwise interpret as markup.
    from rich.markup import escape

    parser = argparse.ArgumentParser(
        description="Detect non-English languages in text files",
        formatter_class=CustomRichHelpFormatter,
        prog='langdet'
    )
    parser.add_argument(
        'filepath',
        type=str,
        help='Path to file to analyze',
        nargs="?"
    )
    parser.add_argument(
        '-c', '--confidence',
        type=float,
        default=0.0,
        help='Minimum confidence threshold (0.0-1.0, default: 0.0)'
    )
    parser.add_argument(
        '-s', '--chunk-size',
        type=int,
        default=50,  # Reduced default for better detection
        help='Minimum text length for detection (default: 50)'
    )
    parser.add_argument(
        '-p', '--patterns',
        nargs='+',
        default=['"', '//', '#'],
        help='Patterns to search for (default: " // #)'
    )
    parser.add_argument(
        '-i', '--include',
        nargs='+',
        help='Only detect these languages (ISO 639-1 codes)'
    )
    parser.add_argument(
        '-e', '--exclude',
        nargs='+',
        help='Exclude these languages (ISO 639-1 codes)'
    )
    parser.add_argument(
        '--no-progress',
        action='store_true',
        help='Disable progress bar'
    )
    parser.add_argument(
        '-ns', '--no-stats',
        action='store_true',
        help='Disable statistics summary'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )
    # The module inspects sys.argv for --debug at import time; it must also be
    # registered here, otherwise argparse rejects it as an unknown argument.
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Enable debug mode (implies --verbose)'
    )
    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    if not args.filepath:
        console.print("[red]Error: Filepath is required.[/red]")
        sys.exit(1)

    verbose = args.verbose or args.debug
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        filepath = Path(args.filepath)
        detector = LanguageDetector(
            min_confidence=args.confidence,
            chunk_size=args.chunk_size
        )
        results = detector.detect_file(
            filepath=filepath,
            patterns=args.patterns,
            include_langs=args.include,
            exclude_langs=args.exclude,
            show_progress=not args.no_progress,
            debug=verbose
        )
        print_results(results, show_stats=not args.no_stats)
        # Exit code based on findings. SystemExit is not an Exception
        # subclass, so the handlers below do not intercept it.
        sys.exit(0 if not results else 1)
    except FileNotFoundError as e:
        console.print(f"[red]Error: {escape(str(e))}[/red]")
        sys.exit(2)
    except Exception as e:
        console.print(f"[red]Unexpected error: {escape(str(e))}[/red]")
        logger.exception("Unexpected error occurred")
        sys.exit(3)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment