I have published this as an open source website, please check it out here:
-
Web Site URL: https://alshdavid.github.io/commbank-statement-converter
-
Source Code: https://alshdavid.github.io/commbank-statement-converter
I have published this as an open source website, please check it out here:
Web Site URL: https://alshdavid.github.io/commbank-statement-converter
Source Code: https://alshdavid.github.io/commbank-statement-converter
| "use strict";var t=this&&this.__importDefault||function(t){return t&&t.__esModule?t:{default:t}};Object.defineProperty(exports,"__esModule",{value:!0}),exports.parsePDF=void 0;const e=t(require("fs")),n=t(require("pdf-parse")),s=require("json2csv");async function r(t,r){const l=e.default.readFileSync(t),o=(await n.default(l)).text.split("TransactionDebitCreditBalance");o.shift();let a=[],c=0;for(let t of o){const{results:e,finalBalance:n}=i(t,c);c=n,a=[...a,...e]}if(r&&r.endsWith(".json"))e.default.writeFileSync(r,JSON.stringify(a,null,2));else if(r&&r.endsWith(".csv")){const t=s.parse(a);e.default.writeFileSync(r,t)}else console.log(a)}function i(t,e=0){const{openingBalance:n,firstPassResults:s}=function(t){const e=t.split("\n");let n;e.shift();const s=[];if(function(t){return t.includes("OPENING BALANCE")}(e[0])){n=l((e.shift()||"").split("$")[1])}let r=!1;for(const t of e)if(!0!==r){if(""===t)break;if(t.includes("CLOSING BALANCE"))break;t.includes("CREDIT INTEREST EARNED")?r=!0:s.push(t)}else r=!1;return{firstPassResults:s,openingBalance:n}}(t);return function(t,e){const n=[];let s=e;for(const e of t){const t=e.substring(0,6);let[r,...i]=e.substring(6,e.length).split("$");const c=o(i[0]),u=l(i[1]),f=u-s;let g=0;if(0===c){let e=a(f);e.startsWith("-")&&(e=e.substring(1,e.length));e!==r.substring(r.length-e.length,r.length)&&(console.log("WrongDebitCalc"),console.log({newBalance:u-Math.abs(f),diff:Math.abs(f),ds:e,date:t,label:r,debit:g,credit:c,balance:u})),g=Math.abs(f),r=r.substring(0,r.length-e.length)}n.push({date:t,label:r,debit:g,credit:c,balance:u}),s=u}return{results:n,finalBalance:s}}(function(t){const e=[];let n="";for(const s of t){1===s.split("$").length?n+=s:(e.push(n+s),n="")}return e}(s),n||e)}function l(t){if(t.includes("CR"))return o(t.split("CR")[0]);if(t.includes("DR"))return-Math.abs(o(t.split("DR")[0]));throw"No Balance"}function o(t){return t?parseInt(t.replace(/,/g,"").replace(/\./g,"")):0}function a(t){const e=t.toString().startsWith("-")?"-":"",n=t.toString().replace("-","");if(1===n.length)return e+"0.0"+n;if(2===n.length)return e+"0."+n;const s=n.substring(0,n.length-2),r=n.substring(n.length-2,n.length);return parseInt(s).toLocaleString()+"."+r}process.argv[2]&&r(process.argv[2],process.argv[3]),exports.parsePDF=r |
| Usage | |
| Contact alshdavid@gmail.com for help | |
| Download and install node from https://nodejs.org/en/download/ | |
| Copy the "index.js" script below to you computer. | |
| Open up your terminal, cmd, or powershell. | |
| run the following once: | |
| npm install pdf-parse json2csv | |
| Navigate to the directory then run the following for each statement: | |
| node index.js ./statement.pdf ./output.csv | |
| If you want to output a JSON file | |
| node index.js ./statement.pdf ./output.json |
| import fs from 'fs' | |
| import pdf from 'pdf-parse' | |
| import { parse } from 'json2csv'; | |
| export type Cents = number | |
| export type Record = { | |
| date: string, | |
| label: string, | |
| debit: Cents, | |
| credit: Cents, | |
| balance: Cents | |
| }; | |
| if (process.argv[2]) { | |
| parsePDF(process.argv[2], process.argv[3]) | |
| } | |
| export async function parsePDF(file: string, outputFile?: string){ | |
| const dataBuffer = fs.readFileSync(file); | |
| const data = await pdf(dataBuffer) | |
| const vs = data.text.split('TransactionDebitCreditBalance') | |
| vs.shift() | |
| let output: Record[] = [] | |
| let previousBalance = 0 | |
| for (let v of vs) { | |
| const { results, finalBalance } = processTable(v, previousBalance) | |
| previousBalance = finalBalance | |
| output = [ ...output, ...results ] | |
| } | |
| if (outputFile && outputFile.endsWith('.json')) { | |
| fs.writeFileSync(outputFile, JSON.stringify(output, null, 2)) | |
| } else if (outputFile && outputFile.endsWith('.csv')) { | |
| const csv = parse(output); | |
| fs.writeFileSync(outputFile, csv) | |
| } else { | |
| console.log(output) | |
| } | |
| } | |
| function isOpeningBalance(target: string) { | |
| return target.includes('OPENING BALANCE') | |
| } | |
| function getTabletLines(target: string): { firstPassResults: string[], openingBalance: number | undefined } { | |
| const l = target.split('\n') | |
| l.shift() | |
| let openingBalance: number | undefined | |
| const p = [] | |
| if (isOpeningBalance(l[0])) { | |
| const openingLine = l.shift() || '' | |
| openingBalance = parseBalance(openingLine.split('$')[1]) | |
| } | |
| let skip = false | |
| for (const line of l) { | |
| if (skip === true) { | |
| skip = false | |
| continue | |
| } | |
| if (line === '') { | |
| break | |
| } | |
| if (line.includes('CLOSING BALANCE')) { | |
| break | |
| } | |
| if (line.includes('CREDIT INTEREST EARNED')) { | |
| skip = true | |
| continue | |
| } | |
| p.push(line) | |
| } | |
| return { | |
| firstPassResults: p, | |
| openingBalance | |
| } | |
| } | |
| function squashLines(target: string[]): string[] { | |
| const p2: string[] = [] | |
| let temp = '' | |
| for (const line of target) { | |
| const v = line.split('$') | |
| if (v.length === 1) { | |
| temp += line | |
| } else { | |
| p2.push(temp + line) | |
| temp = '' | |
| } | |
| } | |
| return p2 | |
| } | |
| function generateRecords(target: string[], initialBalance: number): { results: Record[], finalBalance: Cents } { | |
| const p3: Record[] = [] | |
| let previousBalance = initialBalance | |
| for (const line of target) { | |
| const date = line.substring(0, 6) | |
| let [ label, ...data ] = line.substring(6, line.length).split('$') | |
| const credit = moneyToCents(data[0]) | |
| const balance = parseBalance(data[1]) | |
| const diff = balance - previousBalance | |
| let debit = 0 | |
| if (credit === 0) { | |
| let ds = centsToMoney(diff) | |
| if (ds.startsWith('-')) { | |
| ds = ds.substring(1, ds.length) | |
| } | |
| const diffInLabel = label.substring(label.length - ds.length, label.length) | |
| if (ds !== diffInLabel) { | |
| console.log('WrongDebitCalc') | |
| console.log({ | |
| newBalance: balance - Math.abs(diff), | |
| diff: Math.abs(diff), | |
| ds, | |
| date, | |
| label, | |
| debit, | |
| credit, | |
| balance | |
| }) | |
| } | |
| debit = Math.abs(diff) | |
| label = label.substring(0, label.length - ds.length) | |
| } | |
| p3.push({ | |
| date, | |
| label, | |
| debit, | |
| credit, | |
| balance | |
| }) | |
| previousBalance = balance | |
| } | |
| return { | |
| results: p3, | |
| finalBalance: previousBalance | |
| } | |
| } | |
| function processTable(t: string, initialBalance: Cents = 0) { | |
| const { openingBalance, firstPassResults } = getTabletLines(t) | |
| const squashedLines = squashLines(firstPassResults) | |
| return generateRecords(squashedLines, openingBalance || initialBalance) | |
| } | |
| function parseBalance(balance: string): Cents { | |
| if (balance.includes('CR')) { | |
| return moneyToCents(balance.split('CR')[0]) | |
| } if (balance.includes('DR')) { | |
| return -Math.abs(moneyToCents(balance.split('DR')[0])) | |
| } | |
| throw 'No Balance' | |
| } | |
| function moneyToCents(money: string): Cents { | |
| return money ? parseInt(money.replace(/,/g, '').replace(/\./g, '')) : 0 | |
| } | |
| function centsToMoney(input: Cents): string { | |
| const negative = input.toString().startsWith('-') ? '-' : '' | |
| const iS = input.toString().replace('-', '') | |
| if (iS.length === 1) { | |
| return negative + '0.0'+ iS | |
| } | |
| if (iS.length === 2) { | |
| return negative + '0.'+ iS | |
| } | |
| const dollars = iS.substring(0, iS.length - 2) | |
| const cents = iS.substring(iS.length - 2, iS.length) | |
| const localDollars = parseInt(dollars).toLocaleString() | |
| return localDollars + '.' + cents | |
| } |
I love your work. I could, however, not get it to work properly. I made this python script that solves the same problem.
The default output is .tsv that you can copy paste directly into a spreadsheet
CommBank Statement to Markdown/TSV Converter
Tool to extract transaction tables from Commonwealth Bank (CommBank, CBA) PDF bank statements and convert them into clean Markdown or TSV tables.
Features
Nilvalues representing empty cells.Installation
Prerequisites
You will need Python 3.7+ installed.
Setup
Create a virtual environment (recommended):
Install dependencies:
Usage
Run the script from your terminal:
Default tsv Output
python CBApdf2table.py input/Statements20190630.pdf > output.tsvMarkdown Output
python CBApdf2table.py --type md input/Statements20190630.pdf > output.mdHow it Works
The script specifically looks for the yellow branding lines used in CommBank statements:
xcoordinates.Source code: