Last active
August 26, 2025 14:05
-
-
Save kokoye2007/5530370a6c165e981321d2c8f3e262f2 to your computer and use it in GitHub Desktop.
Keymagic for Normalization | Reorder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import pandas as pd | |
def to_codepoints(s: str) -> str:
    """Spell out every character of *s* as a ``U``-prefixed hex code point.

    Each code point is written in uppercase hexadecimal, zero-padded to at
    least four digits, and the resulting tokens are joined with ``" + "``.
    An empty string yields an empty string.
    """
    tokens = ["U%04X" % ord(char) for char in s]
    return " + ".join(tokens)
def main():
    """Convert a two-column CSV of text pairs into code point mapping rules.

    Command-line arguments:
        -i / --input:  CSV file with two columns (source, target), no header.
        -o / --output: text file to write, one rule per line.

    Every output line has the form
    ``<source code points> => <target code points> // <source> | <target>``.
    """
    parser = argparse.ArgumentParser(description="Convert Myanmar text pairs to Unicode codepoints mapping")
    parser.add_argument("-i", "--input", required=True, help="Input CSV file (2 columns: source,target)")
    parser.add_argument("-o", "--output", required=True, help="Output TXT file")
    args = parser.parse_args()

    # Read every cell as literal text. Without dtype=str and
    # keep_default_na=False, pandas turns empty cells and strings such as
    # "NA"/"null"/"nan" into float NaN (later stringified as "nan") and parses
    # numeric-looking cells as numbers (so "01" would become "1"), which would
    # emit rules for characters that are not in the input.
    df = pd.read_csv(
        args.input,
        header=None,
        names=["source", "target"],
        dtype=str,
        keep_default_na=False,
    ).fillna("")  # safety net: a row with a missing field maps to empty text

    # Build one rule per CSV row; the trailing comment keeps the readable text
    # next to its code point spelling.
    lines = [
        f"{to_codepoints(src)} => {to_codepoints(tgt)} // {src} | {tgt}"
        for src, tgt in zip(df["source"], df["target"])
    ]

    # Write to TXT
    with open(args.output, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"✅ Conversion complete. Output saved to {args.output}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment