the-solipsist · February 5, 2026 20:51
diff --git a/convert_to_beancount.py b/convert_to_beancount.py
 #!/usr/bin/env python3
 """
 Convert price entries to Beancount format.

 Usage: python convert_to_beancount.py prices.l > beancount_prices.bean

 The converter handles:
 - Currency symbol conversion (₹→INR, $→USD, £→GBP, €→EUR)
 - Removal of commas from prices
 - Quoted commodity names
 - Multiple input formats (CURRENCY PRICE or PRICE CURRENCY)
 - Commodity name normalization to meet Beancount rules:
  * Converts to uppercase
  * Prefixes with 'X' if starting with a number
  * Replaces invalid characters with underscores
  * Truncates to 24 characters max

 Examples:
  P 2020-03-07 "0P0000XV0N.BO" ₹5000.00  → 2020-03-07 price X0P0000XV0N.BO 5000.00 INR
  P 2021-01-22 Points ₹0.15               → 2021-01-22 price POINTS 0.15 INR
  P 2026-02-05 BTC USD 73,597.62          → 2026-02-05 price BTC 73597.62 USD
 """

 import sys
 import re

 # Currency symbol mapping: display symbol -> three-letter currency code.
 #
 # The table is consulted both by exact token match and by substring scans
 # that walk the dict in insertion order, so a multi-character symbol must be
 # listed before any shorter symbol it contains ('L$' before '$').
 # NOTE(review): the plain-ASCII entries ('R', 'MT', 'Fr', 'lei', 'S/', 'B/.')
 # also match as substrings of ordinary tokens — confirm that is intended.
 CURRENCY_MAP = {
     '₹': 'INR',      # Indian Rupee — unique
     '£': 'GBP',      # Pound Sterling — unique
     '€': 'EUR',      # Euro — unique
     '¥': 'JPY',      # Japanese Yen (primary association; CNY often uses ¥ in some contexts but we prioritize uniqueness)
     '₺': 'TRY',      # Turkish Lira — unique modern symbol
     '₽': 'RUB',      # Russian Ruble — unique
     '₪': 'ILS',      # Israeli New Shekel — unique
     '₩': 'KRW',      # South Korean Won — unique
     '฿': 'THB',      # Thai Baht — unique
     '₫': 'VND',      # Vietnamese Dong — unique
     '₴': 'UAH',      # Ukrainian Hryvnia — unique
     '₱': 'PHP',      # Philippine Peso — unique
     '₡': 'CRC',      # Costa Rican Colón — unique
     '₲': 'PYG',      # Paraguayan Guaraní — unique
     '﷼': 'SAR',      # Saudi Riyal (primary / most common unique use; some overlap but distinct in practice)
     'د.إ': 'AED',    # UAE Dirham — unique
     'S/': 'PEN',     # Peruvian Sol — unique
     'Kč': 'CZK',     # Czech Koruna — unique
     'zł': 'PLN',     # Polish Złoty — unique
     'lei': 'RON',    # Romanian Leu — unique
     'лв': 'BGN',     # Bulgarian Lev — unique
     '₸': 'KZT',      # Kazakhstani Tenge — unique
     '៛': 'KHR',      # Cambodian Riel — unique
     'ƒ': 'AWG',      # Aruban Florin — unique (also used historically for Netherlands Antillean guilder, now mostly AWG)
     '؋': 'AFN',      # Afghan Afghani — unique
     '₾': 'GEL',      # Georgian Lari — unique
     'B/.': 'PAB',    # Panamanian Balboa — unique
     'L$': 'LRD',     # Liberian Dollar (distinctive variant)
     'MT': 'MZN',     # Mozambican Metical (unique in this form)
     'R': 'ZAR',      # South African Rand — conventionally unique in most global contexts
     'Fr': 'CHF',     # Swiss Franc (primary and distinctive when written as Fr. or CHF)
     '$': 'USD',      # US Dollar — promised by the module docstring; must stay after 'L$'
 }

 def clean_price(price_str):
     """Return *price_str* without thousands commas or known currency symbols.

     Every key of ``CURRENCY_MAP`` is removed wherever it occurs in the
     string, and surrounding whitespace is trimmed from the result.
     """
     stripped = price_str.replace(',', '')

     # Walk the symbol table in its own order and drop each symbol found.
     for currency_symbol in CURRENCY_MAP:
         if currency_symbol in stripped:
             stripped = stripped.replace(currency_symbol, '')

     return stripped.strip()

 def normalize_commodity(commodity):
     """Normalize a commodity name so it satisfies Beancount's naming rules.

     Rules enforced here:
     - only the characters A-Z, 0-9, ', ., _ and - are kept; anything else
       becomes an underscore
     - the name starts with a capital letter ('X' is prepended otherwise)
     - the name is at most 24 characters long
     - the name ends with a capital letter or a digit (trailing punctuation
       is dropped)

     Args:
         commodity: Raw commodity name, optionally wrapped in double quotes.

     Returns:
         The normalized name. An empty input yields an empty string.
     """
     valid_chars = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'._-")

     # Drop surrounding quotes, uppercase, then sanitise character by character.
     name = commodity.strip('"').upper()
     name = ''.join(c if c in valid_chars else '_' for c in name)

     # Check the first character *after* sanitising so that names beginning
     # with a digit, punctuation or a replaced character all get the prefix.
     if name and not ('A' <= name[0] <= 'Z'):
         name = 'X' + name

     # Truncate first, then trim: cutting at 24 characters can itself expose
     # a punctuation character at the end of the name.
     name = name[:24].rstrip("'._-")

     return name

 def convert_currency(currency_str):
     """Map a currency token to its code, or normalize it as a commodity.

     Whitespace and surrounding double quotes are removed first. A token that
     is a key of ``CURRENCY_MAP`` is replaced by the mapped code; any other
     token is passed through ``normalize_commodity``.
     """
     token = currency_str.strip().strip('"')
     mapped_code = CURRENCY_MAP.get(token)
     if mapped_code is None:
         # Not a known symbol: treat it as an ordinary commodity name.
         return normalize_commodity(token)
     return mapped_code

 def _is_numeric_token(token):
     """Return True if *token* is a finite number once commas/symbols are removed."""
     import math

     try:
         # isfinite() rejects 'nan' / 'inf', which float() would otherwise
         # accept and which would make such a currency token look like a price.
         return math.isfinite(float(clean_price(token)))
     except ValueError:
         return False


 def convert_line(line):
     """Convert a single ``P`` price line to a Beancount ``price`` directive.

     Accepted layouts after the leading ``P`` (tokens are split shell-style so
     quoted names stay in one piece):

         DATE COMMODITY PRICE            (currency symbol may be embedded)
         DATE COMMODITY PRICE CURRENCY
         DATE COMMODITY CURRENCY PRICE

     Args:
         line: One raw input line.

     Returns:
         ``None`` when the line is blank or does not start with ``P ``.
         A string starting with ``; ERROR:`` when the line has too few or too
         many tokens, or when exactly one numeric price cannot be identified.
         Otherwise ``DATE price COMMODITY NUMBER CURRENCY``. A bare number
         with no recognised currency symbol is given the currency ``UNKNOWN``.
     """
     import shlex

     line = line.strip()

     # Blank lines and anything that is not a price entry are skipped.
     if not line.startswith('P '):
         return None

     line = line[2:].strip()

     try:
         parts = shlex.split(line)
     except ValueError:
         # Unbalanced quotes: fall back to a plain whitespace split.
         parts = line.split()

     if len(parts) < 3:
         return f"; ERROR: Not enough parts: {line}"
     if len(parts) > 4:
         return f"; ERROR: Too many parts: {line}"

     date, commodity_raw, *remaining = parts
     commodity = normalize_commodity(convert_currency(commodity_raw))

     if len(remaining) == 1:
         # Single token: a number, optionally with an embedded currency symbol.
         price_token = remaining[0]
         if not _is_numeric_token(price_token):
             return f"; ERROR: No numeric price: {line}"
         price_value = clean_price(price_token)
         price_currency = next(
             (code for symbol, code in CURRENCY_MAP.items() if symbol in price_token),
             "UNKNOWN",
         )
     else:
         # Two tokens: exactly one of them must be the number.
         first_token, second_token = remaining
         first_is_number = _is_numeric_token(first_token)
         second_is_number = _is_numeric_token(second_token)

         if first_is_number and second_is_number:
             # Emitting a number as the currency would produce an invalid entry.
             return f"; ERROR: Ambiguous price: {line}"
         if not first_is_number and not second_is_number:
             return f"; ERROR: No numeric price: {line}"

         if first_is_number:
             # COMMODITY PRICE CURRENCY (e.g., BTC 73597.62 USD)
             price_value = clean_price(first_token)
             price_currency = convert_currency(second_token)
         else:
             # COMMODITY CURRENCY PRICE (e.g., BTC USD 73597.62)
             price_currency = convert_currency(first_token)
             price_value = clean_price(second_token)

     return f"{date} price {commodity} {price_value} {price_currency}"

 def main():
     """Convert every input line and print the ones that produce output.

     Input is read from the file named by the first command-line argument
     (decoded as UTF-8) when one is given, otherwise from standard input.
     """
     cli_args = sys.argv[1:]
     if cli_args:
         with open(cli_args[0], 'r', encoding='utf-8') as source:
             raw_lines = source.readlines()
     else:
         raw_lines = sys.stdin.readlines()

     # Lines that convert to None (or an empty string) are silently dropped.
     for output in map(convert_line, raw_lines):
         if output:
             print(output)


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	"""
	Convert price entries to Beancount format.

	Usage: python convert_to_beancount.py prices.l > beancount_prices.bean

	The converter handles:
	- Currency symbol conversion (₹→INR, $→USD, £→GBP, €→EUR)
	- Removal of commas from prices
	- Quoted commodity names
	- Multiple input formats (CURRENCY PRICE or PRICE CURRENCY)
	- Commodity name normalization to meet Beancount rules:
	* Converts to uppercase
	* Prefixes with 'X' if starting with a number
	* Replaces invalid characters with underscores
	* Truncates to 24 characters max

	Examples:
	P 2020-03-07 "0P0000XV0N.BO" ₹5000.00 → 2020-03-07 price X0P0000XV0N.BO 5000.00 INR
	P 2021-01-22 Points ₹0.15 → 2021-01-22 price POINTS 0.15 INR
	P 2026-02-05 BTC USD 73,597.62 → 2026-02-05 price BTC 73597.62 USD
	"""

	import sys
	import re

	# Currency symbol mapping: display symbol -> three-letter currency code.
	#
	# The table is consulted both by exact token match and by substring scans
	# that walk the dict in insertion order, so a multi-character symbol must be
	# listed before any shorter symbol it contains ('L$' before '$').
	# NOTE(review): the plain-ASCII entries ('R', 'MT', 'Fr', 'lei', 'S/', 'B/.')
	# also match as substrings of ordinary tokens — confirm that is intended.
	CURRENCY_MAP = {
	    '₹': 'INR',  # Indian Rupee — unique
	    '£': 'GBP',  # Pound Sterling — unique
	    '€': 'EUR',  # Euro — unique
	    '¥': 'JPY',  # Japanese Yen (primary association; CNY often uses ¥ in some contexts but we prioritize uniqueness)
	    '₺': 'TRY',  # Turkish Lira — unique modern symbol
	    '₽': 'RUB',  # Russian Ruble — unique
	    '₪': 'ILS',  # Israeli New Shekel — unique
	    '₩': 'KRW',  # South Korean Won — unique
	    '฿': 'THB',  # Thai Baht — unique
	    '₫': 'VND',  # Vietnamese Dong — unique
	    '₴': 'UAH',  # Ukrainian Hryvnia — unique
	    '₱': 'PHP',  # Philippine Peso — unique
	    '₡': 'CRC',  # Costa Rican Colón — unique
	    '₲': 'PYG',  # Paraguayan Guaraní — unique
	    '﷼': 'SAR',  # Saudi Riyal (primary / most common unique use; some overlap but distinct in practice)
	    'د.إ': 'AED',  # UAE Dirham — unique
	    'S/': 'PEN',  # Peruvian Sol — unique
	    'Kč': 'CZK',  # Czech Koruna — unique
	    'zł': 'PLN',  # Polish Złoty — unique
	    'lei': 'RON',  # Romanian Leu — unique
	    'лв': 'BGN',  # Bulgarian Lev — unique
	    '₸': 'KZT',  # Kazakhstani Tenge — unique
	    '៛': 'KHR',  # Cambodian Riel — unique
	    'ƒ': 'AWG',  # Aruban Florin — unique (also used historically for Netherlands Antillean guilder, now mostly AWG)
	    '؋': 'AFN',  # Afghan Afghani — unique
	    '₾': 'GEL',  # Georgian Lari — unique
	    'B/.': 'PAB',  # Panamanian Balboa — unique
	    'L$': 'LRD',  # Liberian Dollar (distinctive variant)
	    'MT': 'MZN',  # Mozambican Metical (unique in this form)
	    'R': 'ZAR',  # South African Rand — conventionally unique in most global contexts
	    'Fr': 'CHF',  # Swiss Franc (primary and distinctive when written as Fr. or CHF)
	    '$': 'USD',  # US Dollar — promised by the module docstring; must stay after 'L$'
	}

	def clean_price(price_str):
	    """Return *price_str* without thousands commas or known currency symbols.

	    Every key of ``CURRENCY_MAP`` is removed wherever it occurs in the
	    string, and surrounding whitespace is trimmed from the result.
	    """
	    stripped = price_str.replace(',', '')

	    # Walk the symbol table in its own order and drop each symbol found.
	    for currency_symbol in CURRENCY_MAP:
	        stripped = stripped.replace(currency_symbol, '')

	    return stripped.strip()

	def normalize_commodity(commodity):
	    """Normalize a commodity name so it satisfies Beancount's naming rules.

	    Rules enforced here:
	    - only the characters A-Z, 0-9, ', ., _ and - are kept; anything else
	      becomes an underscore
	    - the name starts with a capital letter ('X' is prepended otherwise)
	    - the name is at most 24 characters long
	    - the name ends with a capital letter or a digit (trailing punctuation
	      is dropped)

	    Args:
	        commodity: Raw commodity name, optionally wrapped in double quotes.

	    Returns:
	        The normalized name. An empty input yields an empty string.
	    """
	    valid_chars = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'._-")

	    # Drop surrounding quotes, uppercase, then sanitise character by character.
	    name = commodity.strip('"').upper()
	    name = ''.join(c if c in valid_chars else '_' for c in name)

	    # Check the first character *after* sanitising so that names beginning
	    # with a digit, punctuation or a replaced character all get the prefix.
	    if name and not ('A' <= name[0] <= 'Z'):
	        name = 'X' + name

	    # Truncate first, then trim: cutting at 24 characters can itself expose
	    # a punctuation character at the end of the name.
	    name = name[:24].rstrip("'._-")

	    return name

	def convert_currency(currency_str):
	    """Map a currency token to its code, or normalize it as a commodity.

	    Whitespace and surrounding double quotes are removed first. A token that
	    is a key of ``CURRENCY_MAP`` is replaced by the mapped code; any other
	    token is passed through ``normalize_commodity``.
	    """
	    token = currency_str.strip().strip('"')

	    if token in CURRENCY_MAP:
	        return CURRENCY_MAP[token]

	    # Not a known symbol: treat it as an ordinary commodity name.
	    return normalize_commodity(token)

	def _is_numeric_token(token):
	    """Return True if *token* is a finite number once commas/symbols are removed."""
	    import math

	    try:
	        # isfinite() rejects 'nan' / 'inf', which float() would otherwise
	        # accept and which would make such a currency token look like a price.
	        return math.isfinite(float(clean_price(token)))
	    except ValueError:
	        return False


	def convert_line(line):
	    """Convert a single ``P`` price line to a Beancount ``price`` directive.

	    Accepted layouts after the leading ``P`` (tokens are split shell-style so
	    quoted names stay in one piece):

	        DATE COMMODITY PRICE            (currency symbol may be embedded)
	        DATE COMMODITY PRICE CURRENCY
	        DATE COMMODITY CURRENCY PRICE

	    Args:
	        line: One raw input line.

	    Returns:
	        ``None`` when the line is blank or does not start with ``P ``.
	        A string starting with ``; ERROR:`` when the line has too few or too
	        many tokens, or when exactly one numeric price cannot be identified.
	        Otherwise ``DATE price COMMODITY NUMBER CURRENCY``. A bare number
	        with no recognised currency symbol is given the currency ``UNKNOWN``.
	    """
	    import shlex

	    line = line.strip()

	    # Blank lines and anything that is not a price entry are skipped.
	    if not line.startswith('P '):
	        return None

	    line = line[2:].strip()

	    try:
	        parts = shlex.split(line)
	    except ValueError:
	        # Unbalanced quotes: fall back to a plain whitespace split.
	        parts = line.split()

	    if len(parts) < 3:
	        return f"; ERROR: Not enough parts: {line}"
	    if len(parts) > 4:
	        return f"; ERROR: Too many parts: {line}"

	    date, commodity_raw, *remaining = parts
	    commodity = normalize_commodity(convert_currency(commodity_raw))

	    if len(remaining) == 1:
	        # Single token: a number, optionally with an embedded currency symbol.
	        price_token = remaining[0]
	        if not _is_numeric_token(price_token):
	            return f"; ERROR: No numeric price: {line}"
	        price_value = clean_price(price_token)
	        price_currency = next(
	            (code for symbol, code in CURRENCY_MAP.items() if symbol in price_token),
	            "UNKNOWN",
	        )
	    else:
	        # Two tokens: exactly one of them must be the number.
	        first_token, second_token = remaining
	        first_is_number = _is_numeric_token(first_token)
	        second_is_number = _is_numeric_token(second_token)

	        if first_is_number and second_is_number:
	            # Emitting a number as the currency would produce an invalid entry.
	            return f"; ERROR: Ambiguous price: {line}"
	        if not first_is_number and not second_is_number:
	            return f"; ERROR: No numeric price: {line}"

	        if first_is_number:
	            # COMMODITY PRICE CURRENCY (e.g., BTC 73597.62 USD)
	            price_value = clean_price(first_token)
	            price_currency = convert_currency(second_token)
	        else:
	            # COMMODITY CURRENCY PRICE (e.g., BTC USD 73597.62)
	            price_currency = convert_currency(first_token)
	            price_value = clean_price(second_token)

	    return f"{date} price {commodity} {price_value} {price_currency}"

	def main():
	    """Convert every input line and print the ones that produce output.

	    Input is read from the file named by the first command-line argument
	    (decoded as UTF-8) when one is given, otherwise from standard input.
	    """
	    if len(sys.argv) > 1:
	        filename = sys.argv[1]
	        with open(filename, 'r', encoding='utf-8') as f:
	            lines = f.readlines()
	    else:
	        lines = sys.stdin.readlines()

	    # Lines that convert to None (or an empty string) are silently dropped.
	    for line in lines:
	        converted = convert_line(line)
	        if converted:
	            print(converted)


	if __name__ == '__main__':
	    main()
No results found