Created
December 22, 2025 08:55
-
-
Save cumulus13/444ab06ffc7a2d0ec83b79cf70c06c03 to your computer and use it in GitHub Desktop.
Detect human languages other than English
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# File: langdet.py
# Author: Hadi Cahyadi <cumulus13@gmail.com>
# Date: 2025-12-22
# Description: Detect human languages other than English
# License: MIT
import argparse
import logging
import os
import shutil
import sys
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Dict

import langid
from rich.console import Console
from rich.markup import escape
from rich.progress import track
from rich.table import Table

try:
    from licface import CustomRichHelpFormatter
except ImportError:
    CustomRichHelpFormatter = argparse.RawTextHelpFormatter
# Passed to richcolorlog's setup_logging() below.
# NOTE(review): presumably a list of logger names to treat specially -- the
# exact semantics are defined by richcolorlog; confirm against its docs.
exceptions = ['langid']

# "--debug" is read straight from argv (before argparse runs) so that the
# logging-related environment variables are in place before the logger below
# is created.
if '--debug' in sys.argv[1:]:
    print("🐞 Debug mode enabled")
    os.environ["DEBUG"] = "1"
    os.environ["LOGGING"] = "1"
    os.environ.pop('NO_LOGGING', None)
    os.environ['TRACEBACK'] = "1"
    LOG_LEVEL = "DEBUG"
else:
    os.environ['NO_LOGGING'] = "1"
    # NOTE(review): this is the same level as the debug branch. It looks like
    # NO_LOGGING is what actually silences output here -- confirm whether a
    # quieter level was intended.
    LOG_LEVEL = "DEBUG"

try:
    from richcolorlog import setup_logging  # type: ignore
    logger = setup_logging(__name__, level=LOG_LEVEL, exceptions=exceptions)
except Exception:
    # richcolorlog is optional: fall back to the standard library when it is
    # missing or fails to initialise. `Exception` (rather than a bare except)
    # lets KeyboardInterrupt/SystemExit propagate.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

# shutil.get_terminal_size() falls back to a default size instead of raising
# OSError when stdout is not attached to a terminal (pipes, CI, redirects).
console = Console(width=shutil.get_terminal_size().columns)
@dataclass
class DetectionResult:
    """A single line flagged by the detector.

    Attributes:
        line_num: Line number used when reporting the hit.
        line_content: Text of the line that was flagged.
        language: Language code assigned to the line.
        confidence: Score the classifier attached to that language.
    """

    line_num: int
    line_content: str
    language: str
    confidence: float

    def __str__(self) -> str:
        """Return a one-line summary: line number, language and score."""
        score = f"{self.confidence:.2f}"
        return f"Line {self.line_num}: {self.language} (confidence: {score})"
class LanguageDetector:
    """
    Line-oriented detector for non-English text, built on langid.

    Confidence values are normalised probabilities in the range 0.0-1.0.
    The module-level ``langid.classify`` returns unnormalised log-probability
    scores, which cannot be compared against a 0.0-1.0 threshold, so this
    class uses its own ``LanguageIdentifier`` created with ``norm_probs=True``.

    Attributes:
        min_confidence: Minimum confidence threshold, clamped to 0.0-1.0.
        chunk_size: Minimum length (in characters) the cleaned text must have
            before detection is attempted; never lower than 5.
    """

    def __init__(
        self,
        min_confidence: float = 0.0,
        chunk_size: int = 50,
        target_languages: Optional[List[str]] = None
    ):
        """
        Initialize the detector.

        Args:
            min_confidence: Minimum confidence score to report (0.0-1.0).
                Values outside that range are clamped.
            chunk_size: Minimum text length for reliable detection
                (values below 5 are raised to 5).
            target_languages: List of ISO 639-1 codes to restrict detection to.
                The restriction applies to this detector only.
        """
        self.min_confidence = max(0.0, min(1.0, min_confidence))
        self.chunk_size = max(5, chunk_size)
        self._target_languages = list(target_languages) if target_languages else None
        # The langid model is loaded lazily (see _get_identifier) so that
        # merely constructing a detector stays cheap.
        self._identifier = None
        logger.info(f"Detector initialized with min_confidence={self.min_confidence}")

    def _get_identifier(self):
        """
        Return this detector's langid identifier, creating it on first use.

        Returns:
            A ``LanguageIdentifier`` that reports normalised probabilities and
            is restricted to the target languages, if any were given.
        """
        identifier = getattr(self, "_identifier", None)
        if identifier is None:
            # Imported here so the model is only touched when a line is
            # actually classified.
            from langid.langid import LanguageIdentifier, model

            identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
            targets = getattr(self, "_target_languages", None)
            if targets:
                identifier.set_languages(targets)
            self._identifier = identifier
        return identifier

    def _should_check_line(self, line: str, patterns: List[str]) -> bool:
        """
        Check if line contains patterns that warrant language detection.

        Args:
            line: Text line to check
            patterns: List of substrings to search for

        Returns:
            True if the line is not blank and contains at least one pattern
        """
        if not line.strip():
            return False
        return any(pattern in line for pattern in patterns)

    def _clean_text(self, text: str) -> str:
        """
        Clean text for better detection accuracy.

        Args:
            text: Raw text

        Returns:
            The text with surrounding whitespace, one layer of each enclosing
            quote style and all comment markers removed
        """
        cleaned = text.strip()
        # Remove string delimiters but keep content. Only delimiters that
        # wrap the whole text are removed.
        for delimiter in ['"', "'", '`']:
            if cleaned.startswith(delimiter) and cleaned.endswith(delimiter):
                cleaned = cleaned[1:-1]
        # Comment markers are replaced by a space (not dropped) so the words
        # on either side do not get glued together.
        for marker in ['//', '#', '/*', '*/']:
            cleaned = cleaned.replace(marker, ' ')
        return cleaned.strip()

    def detect_line(self, line: str, line_num: int = 0, debug: bool = False) -> Optional["DetectionResult"]:
        """
        Detect language in a single line.

        Args:
            line: Text line to analyze
            line_num: Line number for reporting
            debug: Emit debug log messages explaining each decision

        Returns:
            DetectionResult, or None if the line is too short, is English,
            falls below the confidence threshold, or classification failed
        """
        cleaned = self._clean_text(line)
        if debug:
            logger.debug(f"Line {line_num}: Original='{line.strip()}'")
            logger.debug(f"Line {line_num}: Cleaned='{cleaned}' (len={len(cleaned)})")

        # Skip if text is too short for reliable detection
        if len(cleaned) < self.chunk_size:
            if debug:
                logger.debug(f"Line {line_num}: Skipped - too short ({len(cleaned)} < {self.chunk_size})")
            return None

        try:
            lang, confidence = self._get_identifier().classify(cleaned)
        except Exception as e:
            # Best effort: one unclassifiable line must not abort a whole scan.
            logger.warning(f"Detection failed for line {line_num}: {e}")
            return None

        confidence = float(confidence)
        if debug:
            logger.debug(f"Line {line_num}: Detected lang={lang}, confidence={confidence:.2f}")

        # Filter by confidence and exclude English
        if confidence >= self.min_confidence and lang != 'en':
            return DetectionResult(
                line_num=line_num,
                line_content=line.strip(),
                language=lang,
                confidence=confidence
            )
        if debug:
            reason = "is English" if lang == 'en' else f"low confidence ({confidence:.2f} < {self.min_confidence})"
            logger.debug(f"Line {line_num}: Filtered - {reason}")
        return None

    def detect_file(
        self,
        filepath: Path,
        patterns: Optional[List[str]] = None,
        include_langs: Optional[List[str]] = None,
        exclude_langs: Optional[List[str]] = None,
        show_progress: bool = True,
        debug: bool = False
    ) -> List["DetectionResult"]:
        """
        Detect non-English languages in a file.

        Args:
            filepath: Path to file to analyze
            patterns: Patterns to search for (e.g., ['"', '//']); defaults to
                ['"', '//', '#', '/*'] when empty or None
            include_langs: Only report these languages
            exclude_langs: Don't report these languages
            show_progress: Show progress bar
            debug: Enable debug mode

        Returns:
            List of DetectionResult objects, in file order

        Raises:
            FileNotFoundError: If filepath does not exist
            OSError: If the file cannot be read
        """
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {filepath}")

        patterns = patterns or ['"', "//", "#", "/*"]
        included = set(include_langs) if include_langs else None
        excluded = set(exclude_langs) if exclude_langs else None

        try:
            # errors='replace' means undecodable bytes never raise; they are
            # replaced with U+FFFD instead.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
                lines = f.readlines()
        except OSError as e:
            logger.error(f"Error processing file: {e}")
            raise

        numbered = enumerate(lines, 1)
        if show_progress:
            numbered = track(numbered, total=len(lines),
                             description="Analyzing...", console=console)

        results = []
        for line_num, line in numbered:
            if not self._should_check_line(line, patterns):
                if debug:
                    logger.debug(f"Line {line_num}: Skipped - no matching patterns")
                continue

            result = self.detect_line(line, line_num, debug)
            if not result:
                continue

            # Apply include/exclude filters
            if included is not None and result.language not in included:
                if debug:
                    logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' not in include list")
                continue
            if excluded is not None and result.language in excluded:
                if debug:
                    logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' in exclude list")
                continue
            results.append(result)
        return results

    def get_statistics(self, results: List["DetectionResult"]) -> Dict[str, int]:
        """
        Get language statistics from results.

        Args:
            results: List of detection results

        Returns:
            Dictionary mapping language codes to counts, in order of first
            appearance
        """
        return dict(Counter(result.language for result in results))
def print_results(results: List[DetectionResult], show_stats: bool = True):
    """
    Print detection results in a formatted table.

    Args:
        results: List of DetectionResult objects
        show_stats: Whether to show a per-language count summary after the table
    """
    if not results:
        console.print("[yellow]No non-English text detected.[/yellow]")
        return

    # Size the table from the console it is printed on. Querying the terminal
    # directly raises OSError when output is piped or redirected.
    table = Table(title="Language Detection Results", width=console.width)
    table.add_column("Line", style="cyan", justify="right")
    table.add_column("Language", style="yellow")
    table.add_column("Confidence", style="magenta")
    table.add_column("Content", style="white", max_width=60)

    for result in results:
        content = result.line_content
        if len(content) > 60:
            content = content[:60] + "..."
        table.add_row(
            str(result.line_num),
            result.language,
            f"{result.confidence:.2%}",
            # The content is arbitrary file text: escape it so things like
            # "[i]" or "[/x]" are shown literally instead of being parsed as
            # Rich markup.
            escape(content)
        )
    console.print(table)

    if show_stats:
        counts = Counter(result.language for result in results)
        console.print("\n[bold]Language Statistics:[/bold]")
        # most_common() orders by count, highest first; ties keep the order
        # in which the languages first appeared.
        for lang, count in counts.most_common():
            console.print(f" {lang}: {count} occurrence(s)")
def main():
    """
    Main entry point for CLI.

    Parses the command line, scans the given file and prints the findings.
    Always terminates the process via ``sys.exit``:

        0 - no non-English text found
        1 - non-English text found, or no file path was given
        2 - the file does not exist
        3 - any other error
    """
    parser = argparse.ArgumentParser(
        description="Detect non-English languages in text files",
        formatter_class=CustomRichHelpFormatter,
        prog='langdet'
    )
    parser.add_argument(
        'filepath',
        type=str,
        help='Path to file to analyze',
        nargs="?"
    )
    parser.add_argument(
        '-c', '--confidence',
        type=float,
        default=0.0,
        help='Minimum confidence threshold (0.0-1.0, default: 0.0)'
    )
    parser.add_argument(
        '-s', '--chunk-size',
        type=int,
        default=50,
        help='Minimum text length for detection (default: 50)'
    )
    parser.add_argument(
        '-p', '--patterns',
        nargs='+',
        default=['"', '//', '#'],
        help='Patterns to search for (default: " // #)'
    )
    parser.add_argument(
        '-i', '--include',
        nargs='+',
        help='Only detect these languages (ISO 639-1 codes)'
    )
    parser.add_argument(
        '-e', '--exclude',
        nargs='+',
        help='Exclude these languages (ISO 639-1 codes)'
    )
    parser.add_argument(
        '--no-progress',
        action='store_true',
        help='Disable progress bar'
    )
    parser.add_argument(
        '-ns', '--no-stats',
        action='store_true',
        help='Disable statistics summary'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )
    # The module-level setup inspects sys.argv for "--debug" before argparse
    # runs; the flag must also be declared here, otherwise parse_args()
    # rejects it as an unrecognized argument.
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Enable debug mode (debug logging and tracebacks)'
    )
    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    if not args.filepath:
        console.print("[red]Error: Filepath is required.[/red]")
        sys.exit(1)

    debug = args.verbose or args.debug
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        filepath = Path(args.filepath)
        detector = LanguageDetector(
            min_confidence=args.confidence,
            chunk_size=args.chunk_size
        )
        results = detector.detect_file(
            filepath=filepath,
            patterns=args.patterns,
            include_langs=args.include,
            exclude_langs=args.exclude,
            show_progress=not args.no_progress,
            debug=debug
        )
        print_results(results, show_stats=not args.no_stats)
        # Exit code based on findings. SystemExit is not an Exception
        # subclass, so it passes through the handlers below.
        sys.exit(0 if not results else 1)
    except FileNotFoundError as e:
        # Exception text can contain the user's path; escape it so square
        # brackets are not interpreted as Rich markup.
        console.print(f"[red]Error: {escape(str(e))}[/red]")
        sys.exit(2)
    except Exception as e:
        console.print(f"[red]Unexpected error: {escape(str(e))}[/red]")
        logger.exception("Unexpected error occurred")
        sys.exit(3)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment