I have published this as an open source website, please check it out here:
-
Web Site URL: https://alshdavid.github.io/commbank-statement-converter
-
Source Code: https://alshdavid.github.io/commbank-statement-converter
I have published this as an open source website, please check it out here:
Web Site URL: https://alshdavid.github.io/commbank-statement-converter
Source Code: https://alshdavid.github.io/commbank-statement-converter
| "use strict";var t=this&&this.__importDefault||function(t){return t&&t.__esModule?t:{default:t}};Object.defineProperty(exports,"__esModule",{value:!0}),exports.parsePDF=void 0;const e=t(require("fs")),n=t(require("pdf-parse")),s=require("json2csv");async function r(t,r){const l=e.default.readFileSync(t),o=(await n.default(l)).text.split("TransactionDebitCreditBalance");o.shift();let a=[],c=0;for(let t of o){const{results:e,finalBalance:n}=i(t,c);c=n,a=[...a,...e]}if(r&&r.endsWith(".json"))e.default.writeFileSync(r,JSON.stringify(a,null,2));else if(r&&r.endsWith(".csv")){const t=s.parse(a);e.default.writeFileSync(r,t)}else console.log(a)}function i(t,e=0){const{openingBalance:n,firstPassResults:s}=function(t){const e=t.split("\n");let n;e.shift();const s=[];if(function(t){return t.includes("OPENING BALANCE")}(e[0])){n=l((e.shift()||"").split("$")[1])}let r=!1;for(const t of e)if(!0!==r){if(""===t)break;if(t.includes("CLOSING BALANCE"))break;t.includes("CREDIT INTEREST EARNED")?r=!0:s.push(t)}else r=!1;return{firstPassResults:s,openingBalance:n}}(t);return function(t,e){const n=[];let s=e;for(const e of t){const t=e.substring(0,6);let[r,...i]=e.substring(6,e.length).split("$");const c=o(i[0]),u=l(i[1]),f=u-s;let g=0;if(0===c){let e=a(f);e.startsWith("-")&&(e=e.substring(1,e.length));e!==r.substring(r.length-e.length,r.length)&&(console.log("WrongDebitCalc"),console.log({newBalance:u-Math.abs(f),diff:Math.abs(f),ds:e,date:t,label:r,debit:g,credit:c,balance:u})),g=Math.abs(f),r=r.substring(0,r.length-e.length)}n.push({date:t,label:r,debit:g,credit:c,balance:u}),s=u}return{results:n,finalBalance:s}}(function(t){const e=[];let n="";for(const s of t){1===s.split("$").length?n+=s:(e.push(n+s),n="")}return e}(s),n||e)}function l(t){if(t.includes("CR"))return o(t.split("CR")[0]);if(t.includes("DR"))return-Math.abs(o(t.split("DR")[0]));throw"No Balance"}function o(t){return t?parseInt(t.replace(/,/g,"").replace(/\./g,"")):0}function a(t){const e=t.toString().startsWith("-")?"-":"",n=t.toString().replace("-","");if(1===n.length)return e+"0.0"+n;if(2===n.length)return e+"0."+n;const s=n.substring(0,n.length-2),r=n.substring(n.length-2,n.length);return parseInt(s).toLocaleString()+"."+r}process.argv[2]&&r(process.argv[2],process.argv[3]),exports.parsePDF=r |
| Usage | |
| Contact alshdavid@gmail.com for help | |
| Download and install node from https://nodejs.org/en/download/ | |
| Copy the "index.js" script below to you computer. | |
| Open up your terminal, cmd, or powershell. | |
| run the following once: | |
| npm install pdf-parse json2csv | |
| Navigate to the directory then run the following for each statement: | |
| node index.js ./statement.pdf ./output.csv | |
| If you want to output a JSON file | |
| node index.js ./statement.pdf ./output.json |
| import fs from 'fs' | |
| import pdf from 'pdf-parse' | |
| import { parse } from 'json2csv'; | |
| export type Cents = number | |
| export type Record = { | |
| date: string, | |
| label: string, | |
| debit: Cents, | |
| credit: Cents, | |
| balance: Cents | |
| }; | |
| if (process.argv[2]) { | |
| parsePDF(process.argv[2], process.argv[3]) | |
| } | |
| export async function parsePDF(file: string, outputFile?: string){ | |
| const dataBuffer = fs.readFileSync(file); | |
| const data = await pdf(dataBuffer) | |
| const vs = data.text.split('TransactionDebitCreditBalance') | |
| vs.shift() | |
| let output: Record[] = [] | |
| let previousBalance = 0 | |
| for (let v of vs) { | |
| const { results, finalBalance } = processTable(v, previousBalance) | |
| previousBalance = finalBalance | |
| output = [ ...output, ...results ] | |
| } | |
| if (outputFile && outputFile.endsWith('.json')) { | |
| fs.writeFileSync(outputFile, JSON.stringify(output, null, 2)) | |
| } else if (outputFile && outputFile.endsWith('.csv')) { | |
| const csv = parse(output); | |
| fs.writeFileSync(outputFile, csv) | |
| } else { | |
| console.log(output) | |
| } | |
| } | |
| function isOpeningBalance(target: string) { | |
| return target.includes('OPENING BALANCE') | |
| } | |
| function getTabletLines(target: string): { firstPassResults: string[], openingBalance: number | undefined } { | |
| const l = target.split('\n') | |
| l.shift() | |
| let openingBalance: number | undefined | |
| const p = [] | |
| if (isOpeningBalance(l[0])) { | |
| const openingLine = l.shift() || '' | |
| openingBalance = parseBalance(openingLine.split('$')[1]) | |
| } | |
| let skip = false | |
| for (const line of l) { | |
| if (skip === true) { | |
| skip = false | |
| continue | |
| } | |
| if (line === '') { | |
| break | |
| } | |
| if (line.includes('CLOSING BALANCE')) { | |
| break | |
| } | |
| if (line.includes('CREDIT INTEREST EARNED')) { | |
| skip = true | |
| continue | |
| } | |
| p.push(line) | |
| } | |
| return { | |
| firstPassResults: p, | |
| openingBalance | |
| } | |
| } | |
| function squashLines(target: string[]): string[] { | |
| const p2: string[] = [] | |
| let temp = '' | |
| for (const line of target) { | |
| const v = line.split('$') | |
| if (v.length === 1) { | |
| temp += line | |
| } else { | |
| p2.push(temp + line) | |
| temp = '' | |
| } | |
| } | |
| return p2 | |
| } | |
| function generateRecords(target: string[], initialBalance: number): { results: Record[], finalBalance: Cents } { | |
| const p3: Record[] = [] | |
| let previousBalance = initialBalance | |
| for (const line of target) { | |
| const date = line.substring(0, 6) | |
| let [ label, ...data ] = line.substring(6, line.length).split('$') | |
| const credit = moneyToCents(data[0]) | |
| const balance = parseBalance(data[1]) | |
| const diff = balance - previousBalance | |
| let debit = 0 | |
| if (credit === 0) { | |
| let ds = centsToMoney(diff) | |
| if (ds.startsWith('-')) { | |
| ds = ds.substring(1, ds.length) | |
| } | |
| const diffInLabel = label.substring(label.length - ds.length, label.length) | |
| if (ds !== diffInLabel) { | |
| console.log('WrongDebitCalc') | |
| console.log({ | |
| newBalance: balance - Math.abs(diff), | |
| diff: Math.abs(diff), | |
| ds, | |
| date, | |
| label, | |
| debit, | |
| credit, | |
| balance | |
| }) | |
| } | |
| debit = Math.abs(diff) | |
| label = label.substring(0, label.length - ds.length) | |
| } | |
| p3.push({ | |
| date, | |
| label, | |
| debit, | |
| credit, | |
| balance | |
| }) | |
| previousBalance = balance | |
| } | |
| return { | |
| results: p3, | |
| finalBalance: previousBalance | |
| } | |
| } | |
| function processTable(t: string, initialBalance: Cents = 0) { | |
| const { openingBalance, firstPassResults } = getTabletLines(t) | |
| const squashedLines = squashLines(firstPassResults) | |
| return generateRecords(squashedLines, openingBalance || initialBalance) | |
| } | |
| function parseBalance(balance: string): Cents { | |
| if (balance.includes('CR')) { | |
| return moneyToCents(balance.split('CR')[0]) | |
| } if (balance.includes('DR')) { | |
| return -Math.abs(moneyToCents(balance.split('DR')[0])) | |
| } | |
| throw 'No Balance' | |
| } | |
| function moneyToCents(money: string): Cents { | |
| return money ? parseInt(money.replace(/,/g, '').replace(/\./g, '')) : 0 | |
| } | |
| function centsToMoney(input: Cents): string { | |
| const negative = input.toString().startsWith('-') ? '-' : '' | |
| const iS = input.toString().replace('-', '') | |
| if (iS.length === 1) { | |
| return negative + '0.0'+ iS | |
| } | |
| if (iS.length === 2) { | |
| return negative + '0.'+ iS | |
| } | |
| const dollars = iS.substring(0, iS.length - 2) | |
| const cents = iS.substring(iS.length - 2, iS.length) | |
| const localDollars = parseInt(dollars).toLocaleString() | |
| return localDollars + '.' + cents | |
| } |
Updated this to work from in the browser:
URL: https://alshdavid.github.io/commbank-statement-converter
Repo: https://github.com/alshdavid/commbank-statement-converter
I love your work. I could, however, not get it to work properly. I made this python script that solves the same problem.
The default output is .tsv that you can copy paste directly into a spreadsheet
Tool to extract transaction tables from Commonwealth Bank (CommBank, CBA) PDF bank statements and convert them into clean Markdown or TSV tables.
Nil values representing empty cells.You will need Python 3.7+ installed.
Create a virtual environment (recommended):
python -m venv .venv
# On macOS/Linux
source .venv/bin/activate
# On Windows
.venv\Scripts\activate Install dependencies:
pip install pdfplumber pandasRun the script from your terminal:
python CBApdf2table.py input/Statements20190630.pdf > output.tsvpython CBApdf2table.py --type md input/Statements20190630.pdf > output.mdThe script specifically looks for the yellow branding lines used in CommBank statements:
x coordinates.import pdfplumber
import pandas as pd
import sys
import time
import re
def clean_value(text, is_balance=False):
"""Clean the extracted cell text."""
if not text:
return ""
text = text.replace('(', '').replace(')', '').strip()
if is_balance:
text = re.sub(r'(\d)\s*(DR|CR)$', r'\1 \2', text)
return text
def parse_iso_date(date_str, year):
"""Convert '01 Jan' + year to 'YYYY-MM-dd'."""
try:
if not date_str or not re.match(r'^\d{1,2}\s+[A-Za-z]+$', date_str):
return date_str
parts = date_str.split()
day = parts[0].zfill(2)
month_str = parts[1][:3].title()
months = {
"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
"May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
"Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"
}
month_num = months.get(month_str)
if not month_num:
return date_str
return f"{year}-{month_num}-{day}"
except Exception:
return date_str
def to_numeric_string(text):
"""Convert currency string to a clean numeric string (e.g., '1,234.56' -> '1234.56')."""
if not text or text.lower() == 'nil':
return ""
# Remove $, commas, and extra whitespace
clean = text.replace('$', '').replace(',', '').strip()
# Match the numeric part (including decimal)
match = re.search(r'[\d.]+', clean)
return match.group(0) if match else ""
def get_rows_from_page(page, yellow_color=(0.0, 0.239, 0.941, 0.0)):
h_elements = page.lines + page.rects + page.curves
yellow_lines = []
for elem in h_elements:
colour = elem.get('stroking_color') or elem.get('non_stroking_color')
if colour and tuple(colour) == yellow_color:
width = elem.get('width') or (elem.get('x1', 0) - elem.get('x0', 0))
if width > page.width * 0.3:
yellow_lines.append(elem)
if not yellow_lines:
return [], False
yellow_lines.sort(key=lambda x: x['top'])
header_info = None
header_line_index = -1
for i, line in enumerate(yellow_lines):
bbox = (0, max(0, line['top'] - 5), page.width, min(page.height, line['bottom'] + 25))
crop = page.crop(bbox)
text = crop.extract_text()
if text and "Date" in text and "Transaction" in text:
header_words = crop.extract_words()
target_headers = ["Date", "Transaction", "Debit", "Credit", "Balance"]
found_headers = {w['text']: w['x0'] for w in header_words if w['text'] in target_headers}
if "Date" in found_headers and "Transaction" in found_headers:
header_info = found_headers
header_line_index = i
start_y = line['bottom']
break
if header_line_index == -1:
return [], False
separators = []
for line in yellow_lines[header_line_index + 1:]:
y = line['top']
if not separators or y - separators[-1] > 5:
separators.append(y)
rows = []
stop_reached = False
current_start_y = start_y
# Precise boundaries
col_x = [50, 89, 350, 395, 460]
for end_y in separators:
if end_y <= current_start_y + 2:
continue
bbox = (0, current_start_y, page.width, end_y)
row_crop = page.crop(bbox)
words = row_crop.extract_words()
if words:
cols = [[] for _ in range(5)]
for w in words:
x = w['x0']
text = w['text']
if x < col_x[0] - 5: continue
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
split_done = False
for m in months:
if text.startswith(m) and len(text) > len(m) and x < col_x[1] - 5:
cols[0].append(m)
cols[1].append(text[len(m):])
split_done = True
break
if split_done: continue
if x < col_x[1] - 5: cols[0].append(text)
elif x < col_x[2] - 5: cols[1].append(text)
elif x < col_x[3] - 5: cols[2].append(text)
elif x < col_x[4] - 5: cols[3].append(text)
else: cols[4].append(text)
row_data = [
" ".join(cols[0]).strip(),
" ".join(cols[1]).strip(),
clean_value(" ".join(cols[2])),
clean_value(" ".join(cols[3])),
clean_value(" ".join(cols[4]), is_balance=True)
]
tx_upper = row_data[1].upper()
if "CLOSING BALANCE" in tx_upper:
rows.append(row_data)
stop_reached = True
break
if "SUMMARY" in tx_upper:
stop_reached = True
break
if row_data[0] == "Date" and "Transaction" in row_data[1]:
current_start_y = end_y
continue
if any(cell for cell in row_data):
rows.append(row_data)
current_start_y = end_y
return rows, stop_reached
def custom_markdown_table(rows):
if not rows: return ""
header = ["Date", "Transaction", "Debit", "Credit", "Balance", "_iso_date", "_value", "_balance"]
output = []
output.append("| " + " | ".join(header) + " |")
output.append("|" + "|".join("-" for _ in header) + "|")
for row in rows:
output.append("| " + " | ".join(cell.strip() for cell in row) + " |")
return "\n".join(output)
def rows_to_tsv(rows):
if not rows: return ""
header = ["Date", "Transaction", "Debit", "Credit", "Balance", "_iso_date", "_value", "_balance"]
output = []
output.append("\t".join(header))
for row in rows:
output.append("\t".join(cell.strip() for cell in row))
return "\n".join(output)
def main():
args = sys.argv[1:]
if not args:
print("Usage: python script.py <pdf_path> [--type=tsv|md]", file=sys.stderr)
sys.exit(1)
pdf_path = None
output_type = "tsv"
i = 0
while i < len(args):
arg = args[i]
if arg == "--type" and i + 1 < len(args):
output_type = args[i+1].lower()
i += 2
elif arg.startswith("--type="):
output_type = arg.split("=")[1].lower()
i += 1
elif arg.startswith("type="): # Keep legacy support
output_type = arg.split("=")[1].lower()
i += 1
elif not pdf_path:
pdf_path = arg
i += 1
else:
i += 1
if not pdf_path:
print("Error: No PDF path provided.", file=sys.stderr)
sys.exit(1)
all_rows = []
start_total = time.time()
with pdfplumber.open(pdf_path) as pdf:
# 1. Find the year on page 1
first_page_text = pdf.pages[0].extract_text() or ""
# Match 'Statement Period' followed by any text and a 4-digit year
year_match = re.search(r'Period.*?\b(20\d{2})\b', first_page_text, re.DOTALL)
if not year_match:
# Fallback: look for any 20xx year on the first page
year_match = re.search(r'\b(20\d{2})\b', first_page_text)
statement_year = year_match.group(1) if year_match else "YYYY"
print(f"Detected Year: {statement_year}", file=sys.stderr)
for page_num, page in enumerate(pdf.pages, start=1):
page_start = time.time()
p_rows, stop = get_rows_from_page(page)
if p_rows:
# Add calculated columns
for row in p_rows:
# Original columns: [Date, Tx, Debit, Credit, Balance]
date_val, tx_val, debit_val, credit_val, balance_val = row
# 1. _iso_Date
iso_date = parse_iso_date(date_val, statement_year)
# 2. _value
num_debit = to_numeric_string(debit_val)
num_credit = to_numeric_string(credit_val)
calc_value = ""
if num_credit:
calc_value = num_credit
elif num_debit:
calc_value = f"-{num_debit}"
# 3. _balance
num_balance = to_numeric_string(balance_val)
calc_balance = num_balance
if "DR" in balance_val.upper() and num_balance:
calc_balance = f"-{num_balance}"
elif balance_val.lower() == 'nil':
calc_balance = "0.00"
# Append new columns in order: _iso_date, _value, _balance
row.extend([iso_date, calc_value, calc_balance])
all_rows.extend(p_rows)
print(f"Page {page_num}: extracted {len(p_rows)} rows ({time.time() - page_start:.3f}s)", file=sys.stderr)
if stop:
print(f"Page {page_num}: reached stop symbol, ending.", file=sys.stderr)
break
else:
print(f"Page {page_num}: skipped ({time.time() - page_start:.3f}s)", file=sys.stderr)
if output_type == "tsv":
print(rows_to_tsv(all_rows))
else:
print(custom_markdown_table(all_rows))
print(f"\nTotal time: {time.time() - start_total:.3f}s", file=sys.stderr)
if __name__ == "__main__":
main()
You are a legend mate!