Skip to main content

Data Import

Core Import Tools

Command-line Interface

# Identify importable files: run the importers configured in config.py
# against the files found under ~/Downloads
bean-identify config.py ~/Downloads

# Extract transactions and redirect the output into extracted.beancount
# (-e names the existing ledger; presumably used to spot entries that were
# already imported -- confirm with `bean-extract --help`)
bean-extract -e ledger.beancount config.py ~/Downloads > extracted.beancount

# File documents (-o appears to select the destination directory, here
# ~/documents -- confirm with `bean-file --help`)
bean-file -o ~/documents config.py ~/Downloads

Importer Implementation

Basic Importer Structure

import datetime
from os import path

from beancount.core import data
from beancount.ingest import importer


class CustomImporter(importer.ImporterProtocol):
    """Skeleton importer that claims CSV files and files them under one account.

    Only ``identify``, ``file_account`` and ``file_name`` do real work here;
    ``extract`` and ``file_date`` are placeholders to be filled in for a
    concrete institution.
    """

    def __init__(self, account_root):
        """Store the account under which imported documents are filed.

        Args:
            account_root: Account name returned by ``file_account()``.
        """
        self.account_root = account_root

    def identify(self, file):
        """Return true if this importer can handle the file.

        The check is based on the file extension only and is
        case-insensitive, so ``STATEMENT.CSV`` is recognized as well.
        """
        return file.name.lower().endswith('.csv')

    def extract(self, file):
        """Extract Beancount directives from file.

        Returns:
            A list of directives; empty until the parsing logic is added.
        """
        entries = []
        # Process file and create entries
        return entries

    def file_account(self):
        """Return account for filing."""
        return self.account_root

    def file_name(self, file):
        """Return desired filed filename.

        ``file.name`` may carry directory components; only the final
        component is kept so the result is a plain filename.
        """
        return 'renamed-{}'.format(path.basename(file.name))

    def file_date(self, file):
        """Return the date to associate with the filed document.

        Placeholder: returns today's date. Replace with logic that reads
        the statement date from the file contents.
        """
        return datetime.date.today()

Configuration Setup

#!/usr/bin/env python3
from importers import bank, investment, credit

# One importer instance per institution/account, listed in a single
# module-level CONFIG sequence.
CONFIG = [
    bank.Importer(
        account='Assets:US:Bank:Checking',
        currency='USD',
    ),
    investment.Importer(
        account='Assets:US:Investment',
        commission_account='Expenses:Fees:Commission',
    ),
    credit.Importer(
        account='Liabilities:US:Credit',
        currency='USD',
    ),
]

Testing Framework

Regression Test Setup

import unittest

from beancount.ingest import regression


class TestImporter(unittest.TestCase):
    """Regression test case for a CSV importer."""

    # NOTE(review): confirm that `regression.check_file` exists in the
    # installed Beancount version and injects `importer` and `file` as
    # shown here; this snippet does not show where they come from.
    @regression.check_file(
        account="Assets:Test",
        regexp_mime="text/csv",
    )
    def test_basic(self, importer, file):
        """Basic CSV import test."""
        entries = importer.extract(file)
        # An empty result means the importer produced nothing for the sample.
        self.assertTrue(entries)

Test File Structure

importers/
├── __init__.py
├── bank/
│ ├── __init__.py
│ ├── importer.py
│ ├── test_sample.csv
│ ├── test_sample.csv.extract
│ └── test_sample.csv.file_date

File Processing

File Conversion Cache

def process_file(file):
    """Return the converted content of ``file``.

    Args:
        file: Object exposing ``convert(converter)``.

    Returns:
        Whatever ``file.convert(conversion_function)`` returns.
    """
    # Use file.convert() for caching
    return file.convert(conversion_function)


def conversion_function(filename):
    """Convert file content with caching.

    Placeholder converter handed to ``file.convert()``; it currently
    returns ``None`` until an implementation is supplied.

    Args:
        filename: Name of the file to convert.
    """
    # Implementation

PDF Processing

def extract_pdf_text(filename, extractors=None):
    """Extract text from PDF with fallbacks.

    Each extractor is tried in order. A failure in any extractor except the
    last one is swallowed and the next one is tried; a failure in the last
    extractor propagates to the caller.

    Args:
        filename: Path of the PDF to read.
        extractors: Optional ordered iterable of callables taking
            ``filename``. Defaults to pdfminer, then pdftotext, then
            poppler.

    Returns:
        The value returned by the first extractor that succeeds.

    Raises:
        ValueError: If ``extractors`` is empty.
        Exception: Whatever the last extractor raises.
    """
    if extractors is None:
        extractors = (
            extract_with_pdfminer,
            extract_with_pdftotext,
            extract_with_poppler,
        )
    candidates = list(extractors)
    if not candidates:
        raise ValueError("at least one extractor is required")

    *fallbacks, last_resort = candidates
    for extract in fallbacks:
        try:
            return extract(filename)
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
    # Nothing left to fall back to: let any error surface.
    return last_resort(filename)

Directory Organization

project/
├── documents/
│ ├── Assets/
│ ├── Liabilities/
│ └── Income/
├── importers/
│ └── custom/
├── ledger.beancount
└── import.py

File Moving Logic

def determine_file_path(importer, file):
    """Determine final path for imported file.

    The account's colon-separated components become nested directories and
    the filename is prefixed with the document date, e.g.
    ``Assets/US/Bank/2024-03-05-statement.csv``.

    Args:
        importer: Object providing ``file_date(file)``, ``file_account()``
            and ``file_name(file)``.
        file: The file being filed; passed through to the importer.

    Returns:
        A relative path: the account directories followed by the dated
        filename.
    """
    import os  # local import keeps this snippet self-contained

    date = importer.file_date(file)
    account = importer.file_account()
    clean_name = importer.file_name(file)
    dated_name = f"{date.strftime('%Y-%m-%d')}-{clean_name}"
    # The account was previously fetched but ignored, so the result was a
    # bare filename rather than a path.
    return os.path.join(*account.split(':'), dated_name)

Best Practices

  1. Error Handling
def safe_extract(self, file):
    """Extract with robust error handling.

    Delegates to ``self.extract(file)``. Any ``Exception`` is logged and an
    empty list is returned instead, so one bad file does not abort a whole
    import run.

    Args:
        file: The file to extract entries from.

    Returns:
        The result of ``self.extract(file)``, or ``[]`` if it raised.
    """
    try:
        return self.extract(file)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which the previous logger.error(..., str(e)) call discarded.
        # `logger` is expected to be defined at module level.
        logger.exception("Extraction failed: %s", e)
        return []
  2. File Validation
def validate_file(self, file):
    """Validate file before processing.

    The header check runs first; the structure check is only attempted
    when the header check passes.

    Args:
        file: The file to validate.

    Returns:
        True if both ``self._check_header(file)`` and
        ``self._verify_structure(file)`` are truthy, otherwise False.
    """
    return bool(self._check_header(file) and self._verify_structure(file))
  3. Data Normalization
def clean_payee(self, payee):
    """Normalize payee names.

    Collapses every run of whitespace to a single space, trims both ends,
    upper-cases the result and then looks it up in ``self.payee_map``.

    Args:
        payee: Raw payee string, or None when no payee is present.

    Returns:
        The mapped name if the normalized payee is a key of
        ``self.payee_map``, otherwise the normalized payee itself. None is
        returned unchanged.
    """
    if payee is None:
        # Nothing to normalize; avoid failing on a missing payee.
        return None
    # split() with no arguments splits on whitespace runs and drops leading
    # and trailing whitespace, so no regex (or `re` import) is needed.
    payee = ' '.join(payee.split()).upper()
    return self.payee_map.get(payee, payee)

This guide provides a technical foundation for implementing data import capabilities in Beancount while maintaining precise control over the import process.