Skip to content

Instantly share code, notes, and snippets.

@alshdavid
Last active January 7, 2026 13:45
Show Gist options
  • Select an option

  • Save alshdavid/7626bac4716bdcd52cf0a233ea205b6b to your computer and use it in GitHub Desktop.

Select an option

Save alshdavid/7626bac4716bdcd52cf0a233ea205b6b to your computer and use it in GitHub Desktop.
CBA PDF statement converter to PDF or CSV
"use strict";var t=this&&this.__importDefault||function(t){return t&&t.__esModule?t:{default:t}};Object.defineProperty(exports,"__esModule",{value:!0}),exports.parsePDF=void 0;const e=t(require("fs")),n=t(require("pdf-parse")),s=require("json2csv");async function r(t,r){const l=e.default.readFileSync(t),o=(await n.default(l)).text.split("TransactionDebitCreditBalance");o.shift();let a=[],c=0;for(let t of o){const{results:e,finalBalance:n}=i(t,c);c=n,a=[...a,...e]}if(r&&r.endsWith(".json"))e.default.writeFileSync(r,JSON.stringify(a,null,2));else if(r&&r.endsWith(".csv")){const t=s.parse(a);e.default.writeFileSync(r,t)}else console.log(a)}function i(t,e=0){const{openingBalance:n,firstPassResults:s}=function(t){const e=t.split("\n");let n;e.shift();const s=[];if(function(t){return t.includes("OPENING BALANCE")}(e[0])){n=l((e.shift()||"").split("$")[1])}let r=!1;for(const t of e)if(!0!==r){if(""===t)break;if(t.includes("CLOSING BALANCE"))break;t.includes("CREDIT INTEREST EARNED")?r=!0:s.push(t)}else r=!1;return{firstPassResults:s,openingBalance:n}}(t);return function(t,e){const n=[];let s=e;for(const e of t){const t=e.substring(0,6);let[r,...i]=e.substring(6,e.length).split("$");const c=o(i[0]),u=l(i[1]),f=u-s;let g=0;if(0===c){let e=a(f);e.startsWith("-")&&(e=e.substring(1,e.length));e!==r.substring(r.length-e.length,r.length)&&(console.log("WrongDebitCalc"),console.log({newBalance:u-Math.abs(f),diff:Math.abs(f),ds:e,date:t,label:r,debit:g,credit:c,balance:u})),g=Math.abs(f),r=r.substring(0,r.length-e.length)}n.push({date:t,label:r,debit:g,credit:c,balance:u}),s=u}return{results:n,finalBalance:s}}(function(t){const e=[];let n="";for(const s of t){1===s.split("$").length?n+=s:(e.push(n+s),n="")}return e}(s),n||e)}function l(t){if(t.includes("CR"))return o(t.split("CR")[0]);if(t.includes("DR"))return-Math.abs(o(t.split("DR")[0]));throw"No Balance"}function o(t){return t?parseInt(t.replace(/,/g,"").replace(/\./g,"")):0}function a(t){const e=t.toString().startsWith("-")?"-":"",n=t.toString().replace("-","");if(1===n.length)return e+"0.0"+n;if(2===n.length)return e+"0."+n;const s=n.substring(0,n.length-2),r=n.substring(n.length-2,n.length);return parseInt(s).toLocaleString()+"."+r}process.argv[2]&&r(process.argv[2],process.argv[3]),exports.parsePDF=r
Usage
Contact alshdavid@gmail.com for help
Download and install node from https://nodejs.org/en/download/
Copy the "index.js" script below to you computer.
Open up your terminal, cmd, or powershell.
run the following once:
npm install pdf-parse json2csv
Navigate to the directory then run the following for each statement:
node index.js ./statement.pdf ./output.csv
If you want to output a JSON file
node index.js ./statement.pdf ./output.json
import fs from 'fs'
import pdf from 'pdf-parse'
import { parse } from 'json2csv';
export type Cents = number
export type Record = {
date: string,
label: string,
debit: Cents,
credit: Cents,
balance: Cents
};
if (process.argv[2]) {
parsePDF(process.argv[2], process.argv[3])
}
export async function parsePDF(file: string, outputFile?: string){
const dataBuffer = fs.readFileSync(file);
const data = await pdf(dataBuffer)
const vs = data.text.split('TransactionDebitCreditBalance')
vs.shift()
let output: Record[] = []
let previousBalance = 0
for (let v of vs) {
const { results, finalBalance } = processTable(v, previousBalance)
previousBalance = finalBalance
output = [ ...output, ...results ]
}
if (outputFile && outputFile.endsWith('.json')) {
fs.writeFileSync(outputFile, JSON.stringify(output, null, 2))
} else if (outputFile && outputFile.endsWith('.csv')) {
const csv = parse(output);
fs.writeFileSync(outputFile, csv)
} else {
console.log(output)
}
}
function isOpeningBalance(target: string) {
return target.includes('OPENING BALANCE')
}
function getTabletLines(target: string): { firstPassResults: string[], openingBalance: number | undefined } {
const l = target.split('\n')
l.shift()
let openingBalance: number | undefined
const p = []
if (isOpeningBalance(l[0])) {
const openingLine = l.shift() || ''
openingBalance = parseBalance(openingLine.split('$')[1])
}
let skip = false
for (const line of l) {
if (skip === true) {
skip = false
continue
}
if (line === '') {
break
}
if (line.includes('CLOSING BALANCE')) {
break
}
if (line.includes('CREDIT INTEREST EARNED')) {
skip = true
continue
}
p.push(line)
}
return {
firstPassResults: p,
openingBalance
}
}
function squashLines(target: string[]): string[] {
const p2: string[] = []
let temp = ''
for (const line of target) {
const v = line.split('$')
if (v.length === 1) {
temp += line
} else {
p2.push(temp + line)
temp = ''
}
}
return p2
}
function generateRecords(target: string[], initialBalance: number): { results: Record[], finalBalance: Cents } {
const p3: Record[] = []
let previousBalance = initialBalance
for (const line of target) {
const date = line.substring(0, 6)
let [ label, ...data ] = line.substring(6, line.length).split('$')
const credit = moneyToCents(data[0])
const balance = parseBalance(data[1])
const diff = balance - previousBalance
let debit = 0
if (credit === 0) {
let ds = centsToMoney(diff)
if (ds.startsWith('-')) {
ds = ds.substring(1, ds.length)
}
const diffInLabel = label.substring(label.length - ds.length, label.length)
if (ds !== diffInLabel) {
console.log('WrongDebitCalc')
console.log({
newBalance: balance - Math.abs(diff),
diff: Math.abs(diff),
ds,
date,
label,
debit,
credit,
balance
})
}
debit = Math.abs(diff)
label = label.substring(0, label.length - ds.length)
}
p3.push({
date,
label,
debit,
credit,
balance
})
previousBalance = balance
}
return {
results: p3,
finalBalance: previousBalance
}
}
function processTable(t: string, initialBalance: Cents = 0) {
const { openingBalance, firstPassResults } = getTabletLines(t)
const squashedLines = squashLines(firstPassResults)
return generateRecords(squashedLines, openingBalance || initialBalance)
}
function parseBalance(balance: string): Cents {
if (balance.includes('CR')) {
return moneyToCents(balance.split('CR')[0])
} if (balance.includes('DR')) {
return -Math.abs(moneyToCents(balance.split('DR')[0]))
}
throw 'No Balance'
}
function moneyToCents(money: string): Cents {
return money ? parseInt(money.replace(/,/g, '').replace(/\./g, '')) : 0
}
function centsToMoney(input: Cents): string {
const negative = input.toString().startsWith('-') ? '-' : ''
const iS = input.toString().replace('-', '')
if (iS.length === 1) {
return negative + '0.0'+ iS
}
if (iS.length === 2) {
return negative + '0.'+ iS
}
const dollars = iS.substring(0, iS.length - 2)
const cents = iS.substring(iS.length - 2, iS.length)
const localDollars = parseInt(dollars).toLocaleString()
return localDollars + '.' + cents
}
@tareqni
Copy link

tareqni commented Sep 3, 2022

You are a legend mate!

@alshdavid
Copy link
Author

@mathiasrw
Copy link

mathiasrw commented Jan 7, 2026

I love your work. I could, however, not get it to work properly. I made this python script that solves the same problem.

The default output is .tsv that you can copy paste directly into a spreadsheet


CommBank Statement to Markdown/TSV Converter

Tool to extract transaction tables from Commonwealth Bank (CommBank, CBA) PDF bank statements and convert them into clean Markdown or TSV tables.

Features

  • Precise Data Extraction: Uses PDF vector graphics (yellow lines) to accurately identify row boundaries.
  • Smart Column Detection: Automatically separates Date, Transaction, Debit, Credit, and Balance fields based on their horizontal positions.
  • Narrow Output: Generates ultra-clean Markdown tables optimized for narrow displays.
  • Multiple Formats: Supports both Markdown (default) and TSV (Tab-Separated Values).
  • Cleans Data: Automatically removes parentheses, currency symbols (optional), and junk characters like Nil values representing empty cells.

Installation

Prerequisites

You will need Python 3.7+ installed.

Setup

  1. Create a virtual environment (recommended):

    python -m venv .venv
    
    # On macOS/Linux
    source .venv/bin/activate  
    
    # On Windows
    .venv\Scripts\activate     
  2. Install dependencies:

  3. pip install pdfplumber pandas

Usage

Run the script from your terminal:

Default tsv Output

python CBApdf2table.py input/Statements20190630.pdf > output.tsv

Markdown Output

python CBApdf2table.py --type md input/Statements20190630.pdf > output.md

How it Works

The script specifically looks for the yellow branding lines used in CommBank statements:

  1. It detects the Yellow Header to find the start of the transaction list.
  2. It uses subsequent Yellow Separators as exact bounding boxes for individual rows.
  3. It clusters words into columns based on their x coordinates.
  4. It handles "bleed" where letters from one column overlap another (e.g., long transaction descriptions).
  5. It stops automatically when it reaches the Closing Balance summary.

Source code:

import pdfplumber
import pandas as pd
import sys
import time
import re

def clean_value(text, is_balance=False):
    """Clean the extracted cell text."""
    if not text:
        return ""
    text = text.replace('(', '').replace(')', '').strip()
    
    if is_balance:
        text = re.sub(r'(\d)\s*(DR|CR)$', r'\1 \2', text)
        
    return text

def parse_iso_date(date_str, year):
    """Convert '01 Jan' + year to 'YYYY-MM-dd'."""
    try:
        if not date_str or not re.match(r'^\d{1,2}\s+[A-Za-z]+$', date_str):
            return date_str
            
        parts = date_str.split()
        day = parts[0].zfill(2)
        month_str = parts[1][:3].title()
        
        months = {
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
            "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
            "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"
        }
        month_num = months.get(month_str)
        if not month_num:
            return date_str
            
        return f"{year}-{month_num}-{day}"
    except Exception:
        return date_str

def to_numeric_string(text):
    """Convert currency string to a clean numeric string (e.g., '1,234.56' -> '1234.56')."""
    if not text or text.lower() == 'nil':
        return ""
    # Remove $, commas, and extra whitespace
    clean = text.replace('$', '').replace(',', '').strip()
    # Match the numeric part (including decimal)
    match = re.search(r'[\d.]+', clean)
    return match.group(0) if match else ""

def get_rows_from_page(page, yellow_color=(0.0, 0.239, 0.941, 0.0)):
    h_elements = page.lines + page.rects + page.curves
    yellow_lines = []
    for elem in h_elements:
        colour = elem.get('stroking_color') or elem.get('non_stroking_color')
        if colour and tuple(colour) == yellow_color:
            width = elem.get('width') or (elem.get('x1', 0) - elem.get('x0', 0))
            if width > page.width * 0.3:
                yellow_lines.append(elem)
    
    if not yellow_lines:
        return [], False
    
    yellow_lines.sort(key=lambda x: x['top'])
    
    header_info = None
    header_line_index = -1
    for i, line in enumerate(yellow_lines):
        bbox = (0, max(0, line['top'] - 5), page.width, min(page.height, line['bottom'] + 25))
        crop = page.crop(bbox)
        text = crop.extract_text()
        
        if text and "Date" in text and "Transaction" in text:
            header_words = crop.extract_words()
            target_headers = ["Date", "Transaction", "Debit", "Credit", "Balance"]
            found_headers = {w['text']: w['x0'] for w in header_words if w['text'] in target_headers}
            
            if "Date" in found_headers and "Transaction" in found_headers:
                header_info = found_headers
                header_line_index = i
                start_y = line['bottom']
                break
    
    if header_line_index == -1:
        return [], False

    separators = []
    for line in yellow_lines[header_line_index + 1:]:
        y = line['top']
        if not separators or y - separators[-1] > 5:
            separators.append(y)
            
    rows = []
    stop_reached = False
    current_start_y = start_y
    
    # Precise boundaries
    col_x = [50, 89, 350, 395, 460]
    
    for end_y in separators:
        if end_y <= current_start_y + 2:
            continue
            
        bbox = (0, current_start_y, page.width, end_y)
        row_crop = page.crop(bbox)
        words = row_crop.extract_words()
        
        if words:
            cols = [[] for _ in range(5)]
            for w in words:
                x = w['x0']
                text = w['text']
                if x < col_x[0] - 5: continue
                
                months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
                split_done = False
                for m in months:
                    if text.startswith(m) and len(text) > len(m) and x < col_x[1] - 5:
                        cols[0].append(m)
                        cols[1].append(text[len(m):])
                        split_done = True
                        break
                if split_done: continue

                if x < col_x[1] - 5: cols[0].append(text)
                elif x < col_x[2] - 5: cols[1].append(text)
                elif x < col_x[3] - 5: cols[2].append(text)
                elif x < col_x[4] - 5: cols[3].append(text)
                else: cols[4].append(text)
            
            row_data = [
                " ".join(cols[0]).strip(), 
                " ".join(cols[1]).strip(),
                clean_value(" ".join(cols[2])),
                clean_value(" ".join(cols[3])),
                clean_value(" ".join(cols[4]), is_balance=True)
            ]
            
            tx_upper = row_data[1].upper()
            if "CLOSING BALANCE" in tx_upper:
                rows.append(row_data)
                stop_reached = True
                break
            
            if "SUMMARY" in tx_upper:
                stop_reached = True
                break
                
            if row_data[0] == "Date" and "Transaction" in row_data[1]:
                current_start_y = end_y
                continue

            if any(cell for cell in row_data):
                rows.append(row_data)
        
        current_start_y = end_y
        
    return rows, stop_reached

def custom_markdown_table(rows):
    if not rows: return ""
    header = ["Date", "Transaction", "Debit", "Credit", "Balance", "_iso_date", "_value", "_balance"]
    output = []
    output.append("| " + " | ".join(header) + " |")
    output.append("|" + "|".join("-" for _ in header) + "|")
    for row in rows:
        output.append("| " + " | ".join(cell.strip() for cell in row) + " |")
    return "\n".join(output)

def rows_to_tsv(rows):
    if not rows: return ""
    header = ["Date", "Transaction", "Debit", "Credit", "Balance", "_iso_date", "_value", "_balance"]
    output = []
    output.append("\t".join(header))
    for row in rows:
        output.append("\t".join(cell.strip() for cell in row))
    return "\n".join(output)

def main():
    args = sys.argv[1:]
    if not args:
        print("Usage: python script.py <pdf_path> [--type=tsv|md]", file=sys.stderr)
        sys.exit(1)
    
    pdf_path = None
    output_type = "tsv"
    
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == "--type" and i + 1 < len(args):
            output_type = args[i+1].lower()
            i += 2
        elif arg.startswith("--type="):
            output_type = arg.split("=")[1].lower()
            i += 1
        elif arg.startswith("type="): # Keep legacy support
            output_type = arg.split("=")[1].lower()
            i += 1
        elif not pdf_path:
            pdf_path = arg
            i += 1
        else:
            i += 1
            
    if not pdf_path:
        print("Error: No PDF path provided.", file=sys.stderr)
        sys.exit(1)
    
    all_rows = []
    start_total = time.time()
    
    with pdfplumber.open(pdf_path) as pdf:
        # 1. Find the year on page 1
        first_page_text = pdf.pages[0].extract_text() or ""
        # Match 'Statement Period' followed by any text and a 4-digit year
        year_match = re.search(r'Period.*?\b(20\d{2})\b', first_page_text, re.DOTALL)
        if not year_match:
             # Fallback: look for any 20xx year on the first page
             year_match = re.search(r'\b(20\d{2})\b', first_page_text)
        
        statement_year = year_match.group(1) if year_match else "YYYY"
        print(f"Detected Year: {statement_year}", file=sys.stderr)

        for page_num, page in enumerate(pdf.pages, start=1):
            page_start = time.time()
            p_rows, stop = get_rows_from_page(page)
            if p_rows:
                # Add calculated columns
                for row in p_rows:
                    # Original columns: [Date, Tx, Debit, Credit, Balance]
                    date_val, tx_val, debit_val, credit_val, balance_val = row
                    
                    # 1. _iso_Date
                    iso_date = parse_iso_date(date_val, statement_year)
                    
                    # 2. _value
                    num_debit = to_numeric_string(debit_val)
                    num_credit = to_numeric_string(credit_val)
                    calc_value = ""
                    if num_credit:
                        calc_value = num_credit
                    elif num_debit:
                        calc_value = f"-{num_debit}"
                    
                    # 3. _balance
                    num_balance = to_numeric_string(balance_val)
                    calc_balance = num_balance
                    if "DR" in balance_val.upper() and num_balance:
                        calc_balance = f"-{num_balance}"
                    elif balance_val.lower() == 'nil':
                        calc_balance = "0.00"

                    # Append new columns in order: _iso_date, _value, _balance
                    row.extend([iso_date, calc_value, calc_balance])
                
                all_rows.extend(p_rows)
                print(f"Page {page_num}: extracted {len(p_rows)} rows ({time.time() - page_start:.3f}s)", file=sys.stderr)
                if stop: 
                    print(f"Page {page_num}: reached stop symbol, ending.", file=sys.stderr)
                    break
            else:
                print(f"Page {page_num}: skipped ({time.time() - page_start:.3f}s)", file=sys.stderr)
                    
    if output_type == "tsv":
        print(rows_to_tsv(all_rows))
    else:
        print(custom_markdown_table(all_rows))
        
    print(f"\nTotal time: {time.time() - start_total:.3f}s", file=sys.stderr)

if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment