From 19aabc070bf7a159068c40f51b9f6b38cbeeb083 Mon Sep 17 00:00:00 2001 From: Nicolas Schodet Date: Mon, 30 Sep 2019 00:09:07 +0200 Subject: Importers for Crédit Agricole OFX export and MHTML page dump --- importers/.gitignore | 1 + importers/__init__.py | 0 importers/conftest.py | 2 + importers/cragr/__init__.py | 0 importers/cragr/cragr_mhtml.py | 178 +++++++++++++++++++++++++++++++++++++++++ importers/cragr/cragr_ofx.py | 48 +++++++++++ 6 files changed, 229 insertions(+) create mode 100644 importers/.gitignore create mode 100644 importers/__init__.py create mode 100644 importers/conftest.py create mode 100644 importers/cragr/__init__.py create mode 100644 importers/cragr/cragr_mhtml.py create mode 100644 importers/cragr/cragr_ofx.py diff --git a/importers/.gitignore b/importers/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/importers/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/importers/__init__.py b/importers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/importers/conftest.py b/importers/conftest.py new file mode 100644 index 0000000..a763223 --- /dev/null +++ b/importers/conftest.py @@ -0,0 +1,2 @@ +# This adds the --generate option. 
+pytest_plugins = "beancount.ingest.regression_pytest"
diff --git a/importers/cragr/__init__.py b/importers/cragr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/importers/cragr/cragr_mhtml.py b/importers/cragr/cragr_mhtml.py
new file mode 100644
index 0000000..280d25c
--- /dev/null
+++ b/importers/cragr/cragr_mhtml.py
@@ -0,0 +1,242 @@
+"""Importer for Crédit Agricole webpages."""
+#
+# Copyright (C) 2019 Nicolas Schodet
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+import datetime
+import decimal
+import email.parser
+import email.policy
+import os.path
+import re
+from lxml import etree
+
+from beancount.ingest import importer
+from beancount.core import data, amount
+
+
+class Importer(importer.ImporterProtocol):
+    """An importer for Crédit Agricole webpages saved as MHTML."""
+
+    def __init__(self, account_number, account, basename=None):
+        """Create a new importer posting to the given account.
+
+        Args:
+          account_number: Account number, as a string.  It is inserted as is
+            into the regular expression matched against the page header.
+          account: An account string, the account onto which to post all the
+            amounts parsed.
+          basename: An optional string, the name of the new files.
+        """
+        self.account_number = account_number
+        self.account = account
+        self.basename = basename
+
+    def name(self):
+        """Include the filing account in the name."""
+        return '{}: "{}"'.format(super().name(), self.file_account(None))
+
+    def identify(self, file):
+        """Check whether a file is handled by this importer.
+
+        Return False, instead of raising, for a message without an HTML part
+        or a page without the account header.
+        """
+        if file.mimetype() != 'message/rfc822':
+            return False
+        tree = file.convert(gettree)
+        if tree is None:
+            return False
+        try:
+            header = get_header(tree)
+        except IndexError:
+            return False
+        accounts = header.xpath(
+            ".//div[contains(@class,'OperationMainAccount-headerNumber')]"
+            "/text()")
+        return bool(accounts and re.match(r' *N° +' + self.account_number,
+                                          accounts[0]))
+
+    def file_account(self, _):
+        """Return the account against which we post transactions."""
+        return self.account
+
+    def file_name(self, file):
+        """Return the optional renamed account filename."""
+        if not self.basename:
+            return None
+        return self.basename + os.path.splitext(file.name)[1]
+
+    def file_date(self, file):
+        """Return the date of the first listed operation.
+
+        Return None when the page lists no operation.
+        """
+        tree = file.convert(gettree)
+        operations = get_operations(tree)
+        if not operations:
+            return None
+        first = build_transaction(operations[0], file.name, 0, self.account,
+                                  self.FLAG)
+        return first.date
+
+    def extract(self, file, existing_entries=None):
+        """Extract a list of partially complete transactions from the file.
+
+        The transactions are returned in the reverse of the page order,
+        followed by a balance assertion dated one day after the last of
+        them.  Without any operation there is no date for the balance, an
+        empty list is returned.
+        """
+        tree = file.convert(gettree)
+        entries = [build_transaction(op, file.name, i, self.account,
+                                     self.FLAG)
+                   for i, op in enumerate(get_operations(tree))]
+        if not entries:
+            return []
+        entries.reverse()
+        baldate = entries[-1].date + datetime.timedelta(days=1)
+        balmeta = data.new_metadata(file.name, len(entries))
+        balentry = data.Balance(balmeta, baldate, self.account,
+                                get_balance(tree), None, None)
+        entries.append(balentry)
+        return entries
+
+
+def gettree(filename):
+    """Extract the HTML attachment and parse it.
+
+    Returns:
+      The root element of the parsed HTML, or None if the message has no
+      HTML part.
+    """
+    email_parser = email.parser.BytesParser(policy=email.policy.default)
+    # The whole message is read here, the file can be closed right away.
+    with open(filename, 'rb') as f:
+        content = email_parser.parse(f)
+    body = content.get_body(('html',))
+    if body is None:
+        return None
+    body.set_charset('UTF-8')
+    html = body.get_content()
+    return etree.fromstring(html, etree.HTMLParser())
+
+
+def get_header(tree):
+    """Get the header div with the account details and balance.
+
+    Raises:
+      IndexError: if the page has no such div.
+    """
+    return tree.xpath(
+        "//div[@class='OperationMainAccount-headerContentDescription']"
+    )[0]
+
+
+def get_balance(tree):
+    """Get the account balance at the date of the capture."""
+    header = get_header(tree)
+    bal = header.xpath(
+        ".//div[contains(@class,'OperationMainAccount-headerAmountValue')]"
+    )[0].text
+    return amount.Amount(parse_amount(bal), 'EUR')
+
+
+def get_operations(tree):
+    """Get the list of operations."""
+    return tree.xpath(
+        "//ul[@id='bloc-operations']"
+        "/li[contains(@id,'operation-detail-')]")
+
+
+def parse_date(datestr):
+    """Parse a date, ignore the time which is always the same.
+
+    Raises:
+      ValueError: if the string is not like 'Sep 30, 2019 12:00:00 AM'.
+    """
+    m = re.fullmatch(r'(\w{3} \d{1,2}, \d{4}) 12:00:00 AM', datestr)
+    if m is None:
+        raise ValueError('unexpected date: {!r}'.format(datestr))
+    return datetime.datetime.strptime(m.group(1), '%b %d, %Y').date()
+
+
+def parse_amount(amountstr):
+    """Parse an amount in french format.
+
+    The integer part may be grouped with no-break spaces, the decimal
+    separator is a comma and a no-break space precedes the euro sign.
+
+    Raises:
+      ValueError: if the string does not have this format.
+    """
+    m = re.fullmatch('(?:\\+ )?(-?(?:\\d|\xa0)+),(\\d\\d)\xa0€', amountstr)
+    if m is None:
+        raise ValueError('unexpected amount: {!r}'.format(amountstr))
+    ip = m.group(1).replace('\xa0', '')
+    fp = m.group(2)
+    return decimal.Decimal('{}.{}'.format(ip, fp))
+
+
+def build_transaction(op, filename, i, account, flag):
+    """Build a beancount transaction from a <li> element.
+
+    Args:
+      op: The <li> element of the operation.
+      filename: The name of the file, for the metadata.
+      i: The index of the operation in the page, used as line number in the
+        metadata.  It must match the number in the element id.
+      account: The account to post to.
+      flag: The transaction flag.
+    Returns:
+      A transaction with a single posting and no payee.
+    Raises:
+      ValueError: if the element id is not 'operation-detail-' followed by
+        i, or if the date or the amount cannot be parsed.
+    """
+    # Parse operation.
+    op_id = op.get('id')
+    m = re.fullmatch(r'operation-detail-(\d+)', op_id)
+    # Explicit check, an assert would be skipped by python -O.
+    if m is None or int(m.group(1)) != i:
+        raise ValueError(
+            'unexpected id {!r} for operation {}'.format(op_id, i))
+    op_date = op.xpath("./a/div[@id='dateOperation']")[0].get('aria-label')
+    op_date = parse_date(op_date)
+    op_name = op.xpath(".//div[@id='libelleOperation']")[0].text
+    if op_name is not None:
+        op_name = op_name.replace("'", ' ')
+    op_memo = op.xpath(".//div[@class='Operation-descriptionLine']")
+    if op_memo and op_memo[0].text is not None:
+        op_memo = op_memo[0].text
+        op_memo = op_memo.replace("'", ' ')
+        op_memo = ' '.join(op_memo.split())
+    else:
+        op_memo = None
+    op_amount = op.xpath(".//div[@id='montant']")[0].text
+    op_amount = parse_amount(op_amount)
+    # Prepare beancount transaction.
+    narration = ' / '.join(filter(None, [op_name, op_memo]))
+    units = amount.Amount(op_amount, 'EUR')
+    posting = data.Posting(account, units, None, None, None, None)
+    metadata = data.new_metadata(filename, i)
+    payee = None
+    return data.Transaction(metadata, op_date, flag, payee, narration,
+                            data.EMPTY_SET, data.EMPTY_SET, [posting])
diff --git a/importers/cragr/cragr_ofx.py b/importers/cragr/cragr_ofx.py
new file mode 100644
index 0000000..25abb30
--- /dev/null
+++ b/importers/cragr/cragr_ofx.py
@@ -0,0 +1,48 @@
+"""Importer for Crédit Agricole OFX files.
+
+Based on OFX importer, clean the narration field."""
+#
+# Copyright (C) 2019 Nicolas Schodet
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+from beancount.ingest.importers import ofx
+from beancount.core import data
+import re
+
+
+class Importer(ofx.Importer):
+    """An importer for Crédit Agricole OFX files."""
+
+    def extract(self, file, existing_entries=None):
+        """Extract the entries, cleaning up the narration.
+
+        Drop the ' / OTHER' suffix, and a memo which is '.' or blank.
+        """
+        new_entries = []
+        for entry in super().extract(file, existing_entries):
+            if isinstance(entry, data.Transaction):
+                m = re.fullmatch(r'(.*) / (.*) / OTHER', entry.narration)
+                if m:
+                    name, memo = m.groups()
+                    memo = None if memo == '.' else ' '.join(memo.split())
+                    new_narration = ' / '.join(filter(None, [name, memo]))
+                    entry = entry._replace(narration=new_narration)
+            new_entries.append(entry)
+        return new_entries
-- 
cgit v1.2.3