Data Import
Core Import Tools
Command-line Interface
# Identify importable files: list which files under ~/Downloads are
# recognized by an importer configured in config.py
bean-identify config.py ~/Downloads
# Extract transactions from the recognized files into extracted.beancount;
# -e names the existing ledger passed to the extraction step
bean-extract -e ledger.beancount config.py ~/Downloads > extracted.beancount
# File documents: move the recognized files under the ~/documents
# output directory given with -o
bean-file -o ~/documents config.py ~/Downloads
Importer Implementation
Basic Importer Structure
from beancount.ingest import importer
from beancount.core import data
class CustomImporter(importer.ImporterProtocol):
    """Skeleton importer that recognizes CSV files.

    Each method overrides one hook of ``ImporterProtocol``; the extraction
    logic itself is left as a placeholder.
    """

    def __init__(self, account_root):
        """Store the account under which imported documents are filed."""
        self.account_root = account_root

    def identify(self, file):
        """Return true if this importer can handle the file."""
        return file.name.endswith('.csv')

    def extract(self, file):
        """Extract Beancount directives from file.

        Returns:
            A list of directives; empty until the parsing logic is added.
        """
        entries = []
        # Process file and create entries
        return entries

    def file_account(self, file=None):
        """Return account for filing.

        ``file`` is accepted because the protocol passes the file being
        filed; it defaults to None so argument-less calls keep working.
        """
        return self.account_root

    def file_name(self, file):
        """Return desired filed filename.

        Only the final path component of ``file.name`` is used, so a
        directory prefix never ends up inside the new filename.
        """
        import os

        return 'renamed-{}'.format(os.path.basename(file.name))

    def file_date(self, file):
        """Return the date to file the document under.

        Placeholder: returns today's date. A real importer would normally
        derive the date from the file contents instead.
        """
        import datetime

        return datetime.date.today()
Configuration Setup
#!/usr/bin/env python3
from importers import bank, investment, credit
# One configured importer instance per institution.
CONFIG = [
    # Checking account.
    bank.Importer(account='Assets:US:Bank:Checking', currency='USD'),
    # Investment account; commissions are booked to a separate account.
    investment.Importer(
        account='Assets:US:Investment',
        commission_account='Expenses:Fees:Commission',
    ),
    # Credit card.
    credit.Importer(account='Liabilities:US:Credit', currency='USD'),
]
Testing Framework
Regression Test Setup
import unittest

from beancount.ingest import regression
class TestImporter(unittest.TestCase):
    """Regression test case for a CSV importer."""

    # NOTE(review): confirm that `regression.check_file` exists in the
    # installed Beancount version and that it supplies the `importer` and
    # `file` arguments to the decorated test.
    @regression.check_file(account="Assets:Test", regexp_mime="text/csv")
    def test_basic(self, importer, file):
        """Check that extraction yields at least one entry."""
        self.assertTrue(importer.extract(file))
Test File Structure
importers/
├── __init__.py
├── bank/
│ ├── __init__.py
│ ├── importer.py
│ ├── test_sample.csv
│ ├── test_sample.csv.extract
│ └── test_sample.csv.file_date
File Processing
File Conversion Cache
def process_file(file):
    """Return the converted contents of ``file``.

    The conversion is requested through ``file.convert()`` rather than by
    calling ``conversion_function`` directly, so that the file object can
    cache the result.
    """
    return file.convert(conversion_function)
def conversion_function(filename):
    """Converter stub intended to be passed to ``file.convert()``.

    Takes the name of the file to convert. No conversion is implemented
    yet, so the function currently returns None.
    """
    # Placeholder: the actual conversion goes here.
    return None
PDF Processing
def extract_pdf_text(filename):
    """Extract text from PDF with fallbacks.

    The extractors are tried in order: pdfminer, then pdftotext, then
    poppler. A failure of one of the first two falls through to the next;
    an error raised by the last one propagates to the caller.
    """
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt and SystemExit are never swallowed by a fallback.
    try:
        return extract_with_pdfminer(filename)
    except Exception:
        pass  # Deliberate: fall back to the next extractor.
    try:
        return extract_with_pdftotext(filename)
    except Exception:
        pass  # Deliberate: fall back to the last extractor.
    return extract_with_poppler(filename)
Directory Organization
Recommended Structure
project/
├── documents/
│ ├── Assets/
│ ├── Liabilities/
│ └── Income/
├── importers/
│ └── custom/
├── ledger.beancount
└── import.py
File Moving Logic
def determine_file_path(importer, file):
    """Determine final path for imported file.

    Args:
        importer: Object providing ``file_date``, ``file_account`` and
            ``file_name``, each called with ``file``.
        file: The file being filed; ``file.name`` is its current path.

    Returns:
        A relative path made of one directory per account component
        followed by ``<YYYY-MM-DD>-<name>``.
    """
    import datetime
    import os

    date = importer.file_date(file)
    if date is None:
        # No date supplied by the importer: use the file's modification time.
        date = datetime.date.fromtimestamp(os.path.getmtime(file.name))

    account = importer.file_account(file)

    # No name supplied by the importer: keep the file's own basename.
    clean_name = importer.file_name(file) or os.path.basename(file.name)

    filename = f"{date.strftime('%Y-%m-%d')}-{clean_name}"
    # "Assets:US:Bank" -> Assets/US/Bank/<filename>
    return os.path.join(*account.split(':'), filename)
Best Practices
- Error Handling
def safe_extract(self, file):
    """Extract with robust error handling.

    Returns whatever ``self.extract(file)`` returns. If extraction raises,
    the failure is logged and an empty list is returned instead, so the
    caller cannot tell a failed import from an empty one without the log.
    """
    try:
        return self.extract(file)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which a plain error() call with str(e) would discard.
        logger.exception("Extraction failed: %s", e)
        return []
- File Validation
def validate_file(self, file):
    """Tell whether ``file`` passes both pre-processing checks.

    The header check runs first; the structure check is only run when the
    header check succeeds. Returns True only if both succeed.
    """
    header_ok = self._check_header(file)
    return bool(header_ok and self._verify_structure(file))
- Data Normalization
def clean_payee(self, payee):
    """Return the canonical form of a payee name.

    Runs of whitespace are collapsed to single spaces, surrounding
    whitespace is removed and the result is upper-cased. The cleaned name
    is then looked up in ``self.payee_map``; names without a mapping are
    returned in their cleaned form.
    """
    collapsed = re.sub(r'\s+', ' ', payee)
    key = collapsed.strip().upper()
    return self.payee_map.get(key, key)
This guide provides a technical foundation for implementing data import capabilities in Beancount while maintaining precise control over the import process.