cumulus13 · December 22, 2025 08:55
diff --git a/langdet.py b/langdet.py
 #!/usr/bin/env python3
 # File: langdet.py
 # Author: Hadi Cahyadi <cumulus13@gmail.com>
 # Date: 2025-12-22
 # Description: Detection human language except for english
 # License: MIT

 import sys
 import os
 import argparse
 import logging
 from pathlib import Path
 from typing import List, Optional, Tuple, Dict
 from dataclasses import dataclass
 import langid
 from rich.console import Console
 from rich.table import Table
 from rich.progress import track

 try:
    from licface import CustomRichHelpFormatter
 except ImportError:
    CustomRichHelpFormatter = argparse.RawTextHelpFormatter

 # Passed straight through to richcolorlog's setup_logging().
 # NOTE(review): presumably logger names to silence - confirm against richcolorlog.
 exceptions = ['langid']

 # ``--debug`` is inspected here, before argparse runs, because the logging
 # backend reads these environment variables when it is configured below.
 if len(sys.argv) > 1 and '--debug' in sys.argv:
     print("🐞 Debug mode enabled")
     os.environ["DEBUG"] = "1"
     os.environ["LOGGING"] = "1"
     os.environ.pop('NO_LOGGING', None)
     os.environ['TRACEBACK'] = "1"
 else:
     os.environ['NO_LOGGING'] = "1"

 # NOTE(review): the level is "DEBUG" in both modes; outside debug mode the
 # NO_LOGGING variable is what is expected to keep output quiet - confirm.
 LOG_LEVEL = "DEBUG"

 try:
     from richcolorlog import setup_logging  # type: ignore
     logger = setup_logging(__name__, level=LOG_LEVEL, exceptions=exceptions)
 except Exception:
     # richcolorlog is optional: fall back to the standard library logger.
     # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still propagate.
     logging.basicConfig(
         level=logging.INFO,
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
     )
     logger = logging.getLogger(__name__)

 # os.get_terminal_size() raises OSError when stdout is not a terminal
 # (piped output, CI). In that case let Rich pick the width itself.
 try:
     _terminal_width = os.get_terminal_size().columns
 except OSError:
     _terminal_width = None

 console = Console(width=_terminal_width)


 @dataclass
 class DetectionResult:
     """A single detection: the line it came from, the language and its score."""

     # Line number as supplied by the caller.
     line_num: int
     # The line's text as stored by the caller.
     line_content: str
     # Language code reported by the classifier.
     language: str
     # Score reported by the classifier for ``language``.
     confidence: float

     def __str__(self) -> str:
         """Return a short one-line summary (score shown with two decimals)."""
         score = f"{self.confidence:.2f}"
         return f"Line {self.line_num}: {self.language} (confidence: {score})"


 class LanguageDetector:
     """
     Find non-English natural-language text in individual lines using langid.

     Attributes:
         min_confidence: Minimum normalized probability (0.0-1.0) a detection
             needs in order to be reported.
         chunk_size: Minimum length (in characters) of a cleaned line before
             detection is attempted.
     """

     def __init__(
         self,
         min_confidence: float = 0.0,
         chunk_size: int = 50,  # Reduced from 100 for better detection
         target_languages: Optional[List[str]] = None
     ):
         """
         Initialize the detector.

         Args:
             min_confidence: Minimum confidence score to report; clamped to
                 the range 0.0-1.0.
             chunk_size: Minimum cleaned-text length for detection; values
                 below 5 are raised to 5.
             target_languages: List of ISO 639-1 codes to restrict detection to.
                 The restriction applies to this detector only; langid's
                 module-level state is left untouched.
         """
         self.min_confidence = max(0.0, min(1.0, min_confidence))
         self.chunk_size = max(5, chunk_size)
         self._target_languages = list(target_languages) if target_languages else None
         self._identifier = None

         if self._target_languages:
             # Build the identifier right away so that any error langid raises
             # for these codes surfaces at construction time.
             self._get_identifier()

         logger.info(f"Detector initialized with min_confidence={self.min_confidence}")

     def _get_identifier(self):
         """
         Return this detector's langid identifier, creating it on first use.

         langid's module-level ``classify()`` reports an unnormalized
         log-probability score, which cannot be compared with a 0.0-1.0
         threshold. A dedicated ``LanguageIdentifier`` built with
         ``norm_probs=True`` reports a probability instead.
         """
         identifier = getattr(self, "_identifier", None)
         if identifier is None:
             # Local import: the identifier is only built once a line is
             # actually classified (or target languages were requested).
             from langid.langid import LanguageIdentifier, model

             identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
             languages = getattr(self, "_target_languages", None)
             if languages:
                 identifier.set_languages(languages)
             self._identifier = identifier
         return identifier

     def _should_check_line(self, line: str, patterns: List[str]) -> bool:
         """
         Decide whether a line is worth running through detection.

         Args:
             line: Text line to check
             patterns: Substrings to look for

         Returns:
             True if the line is non-blank and contains at least one pattern
         """
         if not line.strip():
             return False

         return any(pattern in line for pattern in patterns)

     def _clean_text(self, text: str) -> str:
         """
         Strip code punctuation that interferes with detection.

         Args:
             text: Raw text

         Returns:
             The text without one layer of surrounding quotes per quote
             character and with comment markers replaced by spaces
         """
         cleaned = text.strip()

         # Drop a matching pair of surrounding quotes but keep the content
         for delimiter in ['"', "'", '`']:
             if cleaned.startswith(delimiter) and cleaned.endswith(delimiter):
                 cleaned = cleaned[1:-1]

         # Blank out comment markers wherever they occur in the line
         for marker in ['//', '#', '/*', '*/']:
             cleaned = cleaned.replace(marker, ' ')

         return cleaned.strip()

     def detect_line(self, line: str, line_num: int = 0, debug: bool = False) -> Optional["DetectionResult"]:
         """
         Detect language in a single line.

         Args:
             line: Text line to analyze
             line_num: Line number for reporting
             debug: Emit debug log records describing each decision

         Returns:
             DetectionResult, or None when the cleaned line is shorter than
             ``chunk_size``, classification fails, the language is English,
             or the confidence is below ``min_confidence``
         """
         cleaned = self._clean_text(line)

         if debug:
             logger.debug(f"Line {line_num}: Original='{line.strip()}'")
             logger.debug(f"Line {line_num}: Cleaned='{cleaned}' (len={len(cleaned)})")

         # Skip if text is too short for reliable detection
         if len(cleaned) < self.chunk_size:
             if debug:
                 logger.debug(f"Line {line_num}: Skipped - too short ({len(cleaned)} < {self.chunk_size})")
             return None

         # Fetched outside the try block: a failure to build the identifier
         # is a setup problem and must not be reported once per line.
         identifier = self._get_identifier()

         try:
             lang, confidence = identifier.classify(cleaned)
         except Exception as e:
             logger.warning(f"Detection failed for line {line_num}: {e}")
             return None

         confidence = float(confidence)

         if debug:
             logger.debug(f"Line {line_num}: Detected lang={lang}, confidence={confidence:.2f}")

         # Filter by confidence and exclude English
         if confidence >= self.min_confidence and lang != 'en':
             return DetectionResult(
                 line_num=line_num,
                 line_content=line.strip(),
                 language=lang,
                 confidence=confidence
             )

         if debug:
             reason = "is English" if lang == 'en' else f"low confidence ({confidence:.2f} < {self.min_confidence})"
             logger.debug(f"Line {line_num}: Filtered - {reason}")

         return None

     def detect_file(
         self,
         filepath: Path,
         patterns: Optional[List[str]] = None,
         include_langs: Optional[List[str]] = None,
         exclude_langs: Optional[List[str]] = None,
         show_progress: bool = True,
         debug: bool = False
     ) -> List["DetectionResult"]:
         """
         Detect non-English languages in a file.

         Args:
             filepath: Path to file to analyze (a plain string is accepted too)
             patterns: Substrings a line must contain to be analyzed;
                 defaults to ['"', '//', '#', '/*']
             include_langs: Only report these languages
             exclude_langs: Don't report these languages
             show_progress: Show progress bar
             debug: Enable debug mode

         Returns:
             List of DetectionResult objects in file order

         Raises:
             FileNotFoundError: If ``filepath`` does not exist
         """
         filepath = Path(filepath)
         if not filepath.exists():
             raise FileNotFoundError(f"File not found: {filepath}")

         patterns = patterns or ['"', "//", "#", "/*"]
         included = set(include_langs) if include_langs else None
         excluded = set(exclude_langs) if exclude_langs else None
         results = []

         try:
             # errors='replace' substitutes undecodable bytes, so reading
             # cannot fail with UnicodeDecodeError.
             with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
                 lines = f.readlines()

             numbered = enumerate(lines, 1)
             if show_progress:
                 numbered = track(numbered, total=len(lines),
                                  description="Analyzing...", console=console)

             for line_num, line in numbered:
                 if not self._should_check_line(line, patterns):
                     if debug:
                         logger.debug(f"Line {line_num}: Skipped - no matching patterns")
                     continue

                 result = self.detect_line(line, line_num, debug)
                 if not result:
                     continue

                 # Apply include/exclude filters
                 if included and result.language not in included:
                     if debug:
                         logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' not in include list")
                     continue
                 if excluded and result.language in excluded:
                     if debug:
                         logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' in exclude list")
                     continue

                 results.append(result)

         except Exception as e:
             logger.error(f"Error processing file: {e}")
             raise

         return results

     def get_statistics(self, results: List["DetectionResult"]) -> Dict[str, int]:
         """
         Count results per language.

         Args:
             results: List of detection results

         Returns:
             Dictionary mapping language codes to counts
         """
         stats: Dict[str, int] = {}
         for result in results:
             stats[result.language] = stats.get(result.language, 0) + 1
         return stats


 def print_results(results: List[DetectionResult], show_stats: bool = True):
     """
     Print detection results in a formatted table.

     Args:
         results: List of DetectionResult objects
         show_stats: Whether to print a per-language count after the table
     """
     if not results:
         console.print("[yellow]No non-English text detected.[/yellow]")
         return

     # Source lines routinely contain square brackets; without escaping Rich
     # would parse them as markup tags (and can fail on an unmatched tag).
     from rich.markup import escape

     # Reuse the console's width instead of querying the terminal again,
     # which raises OSError when stdout is not a terminal.
     table = Table(title="Language Detection Results", width=console.width)
     table.add_column("Line", style="cyan", justify="right")
     table.add_column("Language", style="yellow")
     table.add_column("Confidence", style="magenta")
     table.add_column("Content", style="white", max_width=60)

     for result in results:
         content = result.line_content
         preview = content[:60] + "..." if len(content) > 60 else content
         table.add_row(
             str(result.line_num),
             result.language,
             f"{result.confidence:.2%}",
             escape(preview)
         )

     console.print(table)

     if show_stats:
         # Counted inline: building a detector just for counting is not needed.
         stats: Dict[str, int] = {}
         for result in results:
             stats[result.language] = stats.get(result.language, 0) + 1

         console.print("\n[bold]Language Statistics:[/bold]")
         for lang, count in sorted(stats.items(), key=lambda x: x[1], reverse=True):
             console.print(f"  {lang}: {count} occurrence(s)")


 def main():
     """
     Main entry point for CLI.

     Exit codes: 0 when no result is reported, 1 when at least one result is
     reported or no file path was given, 2 when the file does not exist,
     3 on any other error.
     """
     parser = argparse.ArgumentParser(
         description="Detect non-English languages in text files",
         formatter_class=CustomRichHelpFormatter,
         prog='langdet'
     )
     parser.add_argument(
         'filepath',
         type=str,
         help='Path to file to analyze',
         nargs="?"
     )
     parser.add_argument(
         '-c', '--confidence',
         type=float,
         default=0.0,
         help='Minimum confidence threshold (0.0-1.0, default: 0.0)'
     )
     parser.add_argument(
         '-s', '--chunk-size',
         type=int,
         default=50,  # Reduced default for better detection
         help='Minimum text length for detection (default: 50)'
     )
     parser.add_argument(
         '-p', '--patterns',
         nargs='+',
         default=['"', '//', '#'],
         help='Patterns to search for (default: " // #)'
     )
     parser.add_argument(
         '-i', '--include',
         nargs='+',
         help='Only detect these languages (ISO 639-1 codes)'
     )
     parser.add_argument(
         '-e', '--exclude',
         nargs='+',
         help='Exclude these languages (ISO 639-1 codes)'
     )
     parser.add_argument(
         '--no-progress',
         action='store_true',
         help='Disable progress bar'
     )
     parser.add_argument(
         '-ns', '--no-stats',
         action='store_true',
         help='Disable statistics summary'
     )
     parser.add_argument(
         '-v', '--verbose',
         action='store_true',
         help='Enable verbose logging'
     )
     # The flag is consumed at import time by scanning sys.argv, but it must
     # also be declared here, otherwise parse_args() rejects it as an
     # unrecognized argument.
     parser.add_argument(
         '--debug',
         action='store_true',
         help='Enable debug mode'
     )

     args = parser.parse_args()

     if len(sys.argv) == 1:
         parser.print_help(sys.stderr)
         sys.exit(1)

     if not args.filepath:
         console.print("[red]Error: Filepath is required.[/red]")
         sys.exit(1)

     debug_enabled = args.verbose or args.debug
     if debug_enabled:
         logging.getLogger().setLevel(logging.DEBUG)

     try:
         filepath = Path(args.filepath)

         detector = LanguageDetector(
             min_confidence=args.confidence,
             chunk_size=args.chunk_size
         )

         results = detector.detect_file(
             filepath=filepath,
             patterns=args.patterns,
             include_langs=args.include,
             exclude_langs=args.exclude,
             show_progress=not args.no_progress,
             debug=debug_enabled
         )

         print_results(results, show_stats=not args.no_stats)

         # Exit code based on findings. SystemExit is not an Exception
         # subclass, so the handlers below do not intercept it.
         sys.exit(1 if results else 0)

     except FileNotFoundError as e:
         console.print(f"[red]Error: {e}[/red]")
         sys.exit(2)
     except Exception as e:
         console.print(f"[red]Unexpected error: {e}[/red]")
         logger.exception("Unexpected error occurred")
         sys.exit(3)


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	# File: langdet.py
	# Author: Hadi Cahyadi <cumulus13@gmail.com>
	# Date: 2025-12-22
	# Description: Detection human language except for english
	# License: MIT

	import sys
	import os
	import argparse
	import logging
	from pathlib import Path
	from typing import List, Optional, Tuple, Dict
	from dataclasses import dataclass
	import langid
	from rich.console import Console
	from rich.table import Table
	from rich.progress import track

	try:
	from licface import CustomRichHelpFormatter
	except ImportError:
	CustomRichHelpFormatter = argparse.RawTextHelpFormatter

	# Passed straight through to richcolorlog's setup_logging().
	# NOTE(review): presumably logger names to silence - confirm against richcolorlog.
	exceptions = ['langid']

	# ``--debug`` is inspected here, before argparse runs, because the logging
	# backend reads these environment variables when it is configured below.
	if len(sys.argv) > 1 and '--debug' in sys.argv:
	    print("🐞 Debug mode enabled")
	    os.environ["DEBUG"] = "1"
	    os.environ["LOGGING"] = "1"
	    os.environ.pop('NO_LOGGING', None)
	    os.environ['TRACEBACK'] = "1"
	else:
	    os.environ['NO_LOGGING'] = "1"

	# NOTE(review): the level is "DEBUG" in both modes; outside debug mode the
	# NO_LOGGING variable is what is expected to keep output quiet - confirm.
	LOG_LEVEL = "DEBUG"

	try:
	    from richcolorlog import setup_logging  # type: ignore
	    logger = setup_logging(__name__, level=LOG_LEVEL, exceptions=exceptions)
	except Exception:
	    # richcolorlog is optional: fall back to the standard library logger.
	    # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still propagate.
	    logging.basicConfig(
	        level=logging.INFO,
	        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	    )
	    logger = logging.getLogger(__name__)

	# os.get_terminal_size() raises OSError when stdout is not a terminal
	# (piped output, CI). In that case let Rich pick the width itself.
	try:
	    _terminal_width = os.get_terminal_size().columns
	except OSError:
	    _terminal_width = None

	console = Console(width=_terminal_width)


	@dataclass
	class DetectionResult:
	    """A single detection: the line it came from, the language and its score."""

	    # Line number as supplied by the caller.
	    line_num: int
	    # The line's text as stored by the caller.
	    line_content: str
	    # Language code reported by the classifier.
	    language: str
	    # Score reported by the classifier for ``language``.
	    confidence: float

	    def __str__(self) -> str:
	        """Return a short one-line summary (score shown with two decimals)."""
	        return f"Line {self.line_num}: {self.language} (confidence: {self.confidence:.2f})"


	class LanguageDetector:
	    """
	    Find non-English natural-language text in individual lines using langid.

	    Attributes:
	        min_confidence: Minimum normalized probability (0.0-1.0) a detection
	            needs in order to be reported.
	        chunk_size: Minimum length (in characters) of a cleaned line before
	            detection is attempted.
	    """

	    def __init__(
	        self,
	        min_confidence: float = 0.0,
	        chunk_size: int = 50,  # Reduced from 100 for better detection
	        target_languages: Optional[List[str]] = None
	    ):
	        """
	        Initialize the detector.

	        Args:
	            min_confidence: Minimum confidence score to report; clamped to
	                the range 0.0-1.0.
	            chunk_size: Minimum cleaned-text length for detection; values
	                below 5 are raised to 5.
	            target_languages: List of ISO 639-1 codes to restrict detection to.
	                The restriction applies to this detector only; langid's
	                module-level state is left untouched.
	        """
	        self.min_confidence = max(0.0, min(1.0, min_confidence))
	        self.chunk_size = max(5, chunk_size)
	        self._target_languages = list(target_languages) if target_languages else None
	        self._identifier = None

	        if self._target_languages:
	            # Build the identifier right away so that any error langid raises
	            # for these codes surfaces at construction time.
	            self._get_identifier()

	        logger.info(f"Detector initialized with min_confidence={self.min_confidence}")

	    def _get_identifier(self):
	        """
	        Return this detector's langid identifier, creating it on first use.

	        langid's module-level ``classify()`` reports an unnormalized
	        log-probability score, which cannot be compared with a 0.0-1.0
	        threshold. A dedicated ``LanguageIdentifier`` built with
	        ``norm_probs=True`` reports a probability instead.
	        """
	        identifier = getattr(self, "_identifier", None)
	        if identifier is None:
	            # Local import: the identifier is only built once a line is
	            # actually classified (or target languages were requested).
	            from langid.langid import LanguageIdentifier, model

	            identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
	            languages = getattr(self, "_target_languages", None)
	            if languages:
	                identifier.set_languages(languages)
	            self._identifier = identifier
	        return identifier

	    def _should_check_line(self, line: str, patterns: List[str]) -> bool:
	        """
	        Decide whether a line is worth running through detection.

	        Args:
	            line: Text line to check
	            patterns: Substrings to look for

	        Returns:
	            True if the line is non-blank and contains at least one pattern
	        """
	        if not line.strip():
	            return False

	        return any(pattern in line for pattern in patterns)

	    def _clean_text(self, text: str) -> str:
	        """
	        Strip code punctuation that interferes with detection.

	        Args:
	            text: Raw text

	        Returns:
	            The text without one layer of surrounding quotes per quote
	            character and with comment markers replaced by spaces
	        """
	        cleaned = text.strip()

	        # Drop a matching pair of surrounding quotes but keep the content
	        for delimiter in ['"', "'", '`']:
	            if cleaned.startswith(delimiter) and cleaned.endswith(delimiter):
	                cleaned = cleaned[1:-1]

	        # Blank out comment markers wherever they occur in the line.
	        # The block-comment markers are '/*' and '*/'; a bare '/' would
	        # also destroy ordinary slashes in the text.
	        for marker in ['//', '#', '/*', '*/']:
	            cleaned = cleaned.replace(marker, ' ')

	        return cleaned.strip()

	    def detect_line(self, line: str, line_num: int = 0, debug: bool = False) -> Optional["DetectionResult"]:
	        """
	        Detect language in a single line.

	        Args:
	            line: Text line to analyze
	            line_num: Line number for reporting
	            debug: Emit debug log records describing each decision

	        Returns:
	            DetectionResult, or None when the cleaned line is shorter than
	            ``chunk_size``, classification fails, the language is English,
	            or the confidence is below ``min_confidence``
	        """
	        cleaned = self._clean_text(line)

	        if debug:
	            logger.debug(f"Line {line_num}: Original='{line.strip()}'")
	            logger.debug(f"Line {line_num}: Cleaned='{cleaned}' (len={len(cleaned)})")

	        # Skip if text is too short for reliable detection
	        if len(cleaned) < self.chunk_size:
	            if debug:
	                logger.debug(f"Line {line_num}: Skipped - too short ({len(cleaned)} < {self.chunk_size})")
	            return None

	        # Fetched outside the try block: a failure to build the identifier
	        # is a setup problem and must not be reported once per line.
	        identifier = self._get_identifier()

	        try:
	            lang, confidence = identifier.classify(cleaned)
	        except Exception as e:
	            logger.warning(f"Detection failed for line {line_num}: {e}")
	            return None

	        confidence = float(confidence)

	        if debug:
	            logger.debug(f"Line {line_num}: Detected lang={lang}, confidence={confidence:.2f}")

	        # Filter by confidence and exclude English
	        if confidence >= self.min_confidence and lang != 'en':
	            return DetectionResult(
	                line_num=line_num,
	                line_content=line.strip(),
	                language=lang,
	                confidence=confidence
	            )

	        if debug:
	            reason = "is English" if lang == 'en' else f"low confidence ({confidence:.2f} < {self.min_confidence})"
	            logger.debug(f"Line {line_num}: Filtered - {reason}")

	        return None

	    def detect_file(
	        self,
	        filepath: Path,
	        patterns: Optional[List[str]] = None,
	        include_langs: Optional[List[str]] = None,
	        exclude_langs: Optional[List[str]] = None,
	        show_progress: bool = True,
	        debug: bool = False
	    ) -> List["DetectionResult"]:
	        """
	        Detect non-English languages in a file.

	        Args:
	            filepath: Path to file to analyze (a plain string is accepted too)
	            patterns: Substrings a line must contain to be analyzed;
	                defaults to ['"', '//', '#', '/*']
	            include_langs: Only report these languages
	            exclude_langs: Don't report these languages
	            show_progress: Show progress bar
	            debug: Enable debug mode

	        Returns:
	            List of DetectionResult objects in file order

	        Raises:
	            FileNotFoundError: If ``filepath`` does not exist
	        """
	        filepath = Path(filepath)
	        if not filepath.exists():
	            raise FileNotFoundError(f"File not found: {filepath}")

	        patterns = patterns or ['"', "//", "#", "/*"]
	        included = set(include_langs) if include_langs else None
	        excluded = set(exclude_langs) if exclude_langs else None
	        results = []

	        try:
	            # errors='replace' substitutes undecodable bytes, so reading
	            # cannot fail with UnicodeDecodeError.
	            with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
	                lines = f.readlines()

	            numbered = enumerate(lines, 1)
	            if show_progress:
	                numbered = track(numbered, total=len(lines),
	                                 description="Analyzing...", console=console)

	            for line_num, line in numbered:
	                if not self._should_check_line(line, patterns):
	                    if debug:
	                        logger.debug(f"Line {line_num}: Skipped - no matching patterns")
	                    continue

	                result = self.detect_line(line, line_num, debug)
	                if not result:
	                    continue

	                # Apply include/exclude filters
	                if included and result.language not in included:
	                    if debug:
	                        logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' not in include list")
	                    continue
	                if excluded and result.language in excluded:
	                    if debug:
	                        logger.debug(f"Line {line_num}: Filtered - lang '{result.language}' in exclude list")
	                    continue

	                results.append(result)

	        except Exception as e:
	            logger.error(f"Error processing file: {e}")
	            raise

	        return results

	    def get_statistics(self, results: List["DetectionResult"]) -> Dict[str, int]:
	        """
	        Count results per language.

	        Args:
	            results: List of detection results

	        Returns:
	            Dictionary mapping language codes to counts
	        """
	        stats: Dict[str, int] = {}
	        for result in results:
	            stats[result.language] = stats.get(result.language, 0) + 1
	        return stats


	def print_results(results: List[DetectionResult], show_stats: bool = True):
	    """
	    Print detection results in a formatted table.

	    Args:
	        results: List of DetectionResult objects
	        show_stats: Whether to print a per-language count after the table
	    """
	    if not results:
	        console.print("[yellow]No non-English text detected.[/yellow]")
	        return

	    # Source lines routinely contain square brackets; without escaping Rich
	    # would parse them as markup tags (and can fail on an unmatched tag).
	    from rich.markup import escape

	    # Reuse the console's width instead of querying the terminal again,
	    # which raises OSError when stdout is not a terminal.
	    table = Table(title="Language Detection Results", width=console.width)
	    table.add_column("Line", style="cyan", justify="right")
	    table.add_column("Language", style="yellow")
	    table.add_column("Confidence", style="magenta")
	    table.add_column("Content", style="white", max_width=60)

	    for result in results:
	        content = result.line_content
	        preview = content[:60] + "..." if len(content) > 60 else content
	        table.add_row(
	            str(result.line_num),
	            result.language,
	            f"{result.confidence:.2%}",
	            escape(preview)
	        )

	    console.print(table)

	    if show_stats:
	        # Counted inline: building a detector just for counting is not needed.
	        stats: Dict[str, int] = {}
	        for result in results:
	            stats[result.language] = stats.get(result.language, 0) + 1

	        console.print("\n[bold]Language Statistics:[/bold]")
	        for lang, count in sorted(stats.items(), key=lambda x: x[1], reverse=True):
	            console.print(f"  {lang}: {count} occurrence(s)")


	def main():
	    """
	    Main entry point for CLI.

	    Exit codes: 0 when no result is reported, 1 when at least one result is
	    reported or no file path was given, 2 when the file does not exist,
	    3 on any other error.
	    """
	    parser = argparse.ArgumentParser(
	        description="Detect non-English languages in text files",
	        formatter_class=CustomRichHelpFormatter,
	        prog='langdet'
	    )
	    parser.add_argument(
	        'filepath',
	        type=str,
	        help='Path to file to analyze',
	        nargs="?"
	    )
	    parser.add_argument(
	        '-c', '--confidence',
	        type=float,
	        default=0.0,
	        help='Minimum confidence threshold (0.0-1.0, default: 0.0)'
	    )
	    parser.add_argument(
	        '-s', '--chunk-size',
	        type=int,
	        default=50,  # Reduced default for better detection
	        help='Minimum text length for detection (default: 50)'
	    )
	    parser.add_argument(
	        '-p', '--patterns',
	        nargs='+',
	        default=['"', '//', '#'],
	        help='Patterns to search for (default: " // #)'
	    )
	    parser.add_argument(
	        '-i', '--include',
	        nargs='+',
	        help='Only detect these languages (ISO 639-1 codes)'
	    )
	    parser.add_argument(
	        '-e', '--exclude',
	        nargs='+',
	        help='Exclude these languages (ISO 639-1 codes)'
	    )
	    parser.add_argument(
	        '--no-progress',
	        action='store_true',
	        help='Disable progress bar'
	    )
	    parser.add_argument(
	        '-ns', '--no-stats',
	        action='store_true',
	        help='Disable statistics summary'
	    )
	    parser.add_argument(
	        '-v', '--verbose',
	        action='store_true',
	        help='Enable verbose logging'
	    )
	    # The flag is consumed at import time by scanning sys.argv, but it must
	    # also be declared here, otherwise parse_args() rejects it as an
	    # unrecognized argument.
	    parser.add_argument(
	        '--debug',
	        action='store_true',
	        help='Enable debug mode'
	    )

	    args = parser.parse_args()

	    if len(sys.argv) == 1:
	        parser.print_help(sys.stderr)
	        sys.exit(1)

	    if not args.filepath:
	        console.print("[red]Error: Filepath is required.[/red]")
	        sys.exit(1)

	    debug_enabled = args.verbose or args.debug
	    if debug_enabled:
	        logging.getLogger().setLevel(logging.DEBUG)

	    try:
	        filepath = Path(args.filepath)

	        detector = LanguageDetector(
	            min_confidence=args.confidence,
	            chunk_size=args.chunk_size
	        )

	        results = detector.detect_file(
	            filepath=filepath,
	            patterns=args.patterns,
	            include_langs=args.include,
	            exclude_langs=args.exclude,
	            show_progress=not args.no_progress,
	            debug=debug_enabled
	        )

	        print_results(results, show_stats=not args.no_stats)

	        # Exit code based on findings. SystemExit is not an Exception
	        # subclass, so the handlers below do not intercept it.
	        sys.exit(1 if results else 0)

	    except FileNotFoundError as e:
	        console.print(f"[red]Error: {e}[/red]")
	        sys.exit(2)
	    except Exception as e:
	        console.print(f"[red]Unexpected error: {e}[/red]")
	        logger.exception("Unexpected error occurred")
	        sys.exit(3)


	if __name__ == "__main__":
	    main()
No results found