From 19aabc070bf7a159068c40f51b9f6b38cbeeb083 Mon Sep 17 00:00:00 2001
From: Nicolas Schodet
Date: Mon, 30 Sep 2019 00:09:07 +0200
Subject: Importers for Crédit Agricole OFX export and MHTML page dump

---
 importers/.gitignore           |   1 +
 importers/__init__.py          |   0
 importers/conftest.py          |   2 +
 importers/cragr/__init__.py    |   0
 importers/cragr/cragr_mhtml.py | 178 +++++++++++++++++++++++++++++++++++++++++
 importers/cragr/cragr_ofx.py   |  48 +++++++++++
 6 files changed, 229 insertions(+)
 create mode 100644 importers/.gitignore
 create mode 100644 importers/__init__.py
 create mode 100644 importers/conftest.py
 create mode 100644 importers/cragr/__init__.py
 create mode 100644 importers/cragr/cragr_mhtml.py
 create mode 100644 importers/cragr/cragr_ofx.py

diff --git a/importers/.gitignore b/importers/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/importers/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/importers/__init__.py b/importers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/importers/conftest.py b/importers/conftest.py
new file mode 100644
index 0000000..a763223
--- /dev/null
+++ b/importers/conftest.py
@@ -0,0 +1,2 @@
+# Load the beancount regression plugin; it adds the --generate option.
+pytest_plugins = "beancount.ingest.regression_pytest"
diff --git a/importers/cragr/__init__.py b/importers/cragr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/importers/cragr/cragr_mhtml.py b/importers/cragr/cragr_mhtml.py
new file mode 100644
index 0000000..280d25c
--- /dev/null
+++ b/importers/cragr/cragr_mhtml.py
@@ -0,0 +1,178 @@
+"""Importer for Crédit Agricole webpages."""
+#
+# Copyright (C) 2019 Nicolas Schodet
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+import datetime
+import decimal
+import email.parser
+import email.policy
+import os.path
+import re
+from lxml import etree
+
+from beancount.ingest import importer
+from beancount.core import data, amount
+
+
+class Importer(importer.ImporterProtocol):
+    """An importer for Crédit Agricole webpages saved as MHTML."""
+
+    def __init__(self, account_number, account, basename=None):
+        """Create a new importer posting to the given account.
+
+        Args:
+          account_number: Account number, matched literally against the page.
+          account: An account string, the account onto which to post all the
+            amounts parsed.
+          basename: An optional string, the name of the new files.
+        """
+        self.account_number = account_number
+        self.account = account
+        self.basename = basename
+
+    def name(self):
+        """Include the filing account in the name."""
+        return '{}: "{}"'.format(super().name(), self.file_account(None))
+
+    def identify(self, file):
+        """Check whether a file is handled by this importer."""
+        if file.mimetype() != 'message/rfc822':
+            return False
+        tree = file.convert(gettree)
+        accounts = get_header(tree).xpath(
+            ".//div[contains(@class,'OperationMainAccount-headerNumber')]"
+            "/text()")
+        # Escape the number so regex metacharacters in it are taken literally.
+        pattern = r' *N° +' + re.escape(self.account_number)
+        return bool(accounts and re.match(pattern, accounts[0]))
+
+    def file_account(self, _):
+        """Return the account against which we post transactions."""
+        return self.account
+
+    def file_name(self, file):
+        """Return the optional renamed account filename."""
+        if self.basename:
+            return self.basename + os.path.splitext(file.name)[1]
+
+    def file_date(self, file):
+        """Return the first listed operation's date, None if no operation."""
+        operations = get_operations(file.convert(gettree))
+        if not operations:
+            return None
+        first = build_transaction(operations[0], file.name, 0, self.account,
+                                  self.FLAG)
+        return first.date
+
+    def extract(self, file, existing_entries=None):
+        """Extract transactions in reverse page order, then a balance check."""
+        tree = file.convert(gettree)
+        entries = [build_transaction(op, file.name, i, self.account, self.FLAG)
+                   for i, op in enumerate(get_operations(tree))]
+        if not entries:
+            # No operation means no date to anchor the balance assertion on.
+            return []
+        entries.reverse()
+        baldate = entries[-1].date + datetime.timedelta(days=1)
+        balmeta = data.new_metadata(file.name, len(entries))
+        balentry = data.Balance(balmeta, baldate, self.account,
+                                get_balance(tree), None, None)
+        entries.append(balentry)
+        return entries
+
+
+def gettree(filename):
+    """Extract the HTML part of the message in filename and parse it."""
+    email_parser = email.parser.BytesParser(policy=email.policy.default)
+    # Close the file as soon as it is parsed instead of leaking the handle.
+    with open(filename, 'rb') as mhtml_file:
+        content = email_parser.parse(mhtml_file)
+    body = content.get_body(('html',))
+    body.set_charset('UTF-8')
+    html = body.get_content()
+    return etree.fromstring(html, etree.HTMLParser())
+
+
+def get_header(tree):
+    """Return the first div holding the account details and balance."""
+    query = "//div[@class='OperationMainAccount-headerContentDescription']"
+    matches = tree.xpath(query)
+    return matches[0]
+
+
+def get_balance(tree):
+    """Return the balance shown in the page header as an EUR amount."""
+    value_divs = get_header(tree).xpath(
+        ".//div[contains(@class,'OperationMainAccount-headerAmountValue')]")
+    # The first matching div carries the balance text.
+    number = parse_amount(value_divs[0].text)
+    return amount.Amount(number, 'EUR')
+
+
+def get_operations(tree):
+    """Return the <li> elements of the operations list."""
+    query = "//ul[@id='bloc-operations']/li[contains(@id,'operation-detail-')]"
+    operations = tree.xpath(query)
+    return operations
+
+
+def parse_date(datestr):
+    """Return the date part of a 'Mon D, YYYY 12:00:00 AM' string."""
+    day = re.fullmatch(r'(\w{3} \d{1,2}, \d{4}) 12:00:00 AM', datestr).group(1)
+    return datetime.datetime.strptime(day, '%b %d, %Y').date()
+
+
+def parse_amount(amountstr):
+    """Parse a French-format amount; raise ValueError if it is malformed."""
+    m = re.fullmatch('(?:\\+ )?(-?(?:\\d|\xa0)+),(\\d\\d)\xa0€', amountstr)
+    if m is None:  # fail with the offending text instead of AttributeError
+        raise ValueError('cannot parse amount {!r}'.format(amountstr))
+    return decimal.Decimal(m.group(1).replace('\xa0', '') + '.' + m.group(2))
+
+
+def build_transaction(op, filename, i, account, flag):
+    """Build a beancount transaction from a <li>.
+
+    Raises ValueError when the <li> id is not 'operation-detail-<i>'.
+    """
+    # Parse operation; the id is validated explicitly since -O strips asserts.
+    op_id = op.get('id')
+    m = re.fullmatch(r'operation-detail-(\d+)', op_id)
+    if m is None or int(m.group(1)) != i:
+        raise ValueError(
+            'operation {} has unexpected id {!r}'.format(i, op_id))
+    op_date = op.xpath("./a/div[@id='dateOperation']")[0].get('aria-label')
+    op_date = parse_date(op_date)
+    op_name = op.xpath(".//div[@id='libelleOperation']")[0].text
+    op_name = op_name.replace("'", ' ')
+    op_memo = op.xpath(".//div[@class='Operation-descriptionLine']")
+    if op_memo and op_memo[0].text is not None:
+        op_memo = ' '.join(op_memo[0].text.replace("'", ' ').split())
+    else:
+        op_memo = None
+    op_amount = parse_amount(op.xpath(".//div[@id='montant']")[0].text)
+    # Prepare beancount transaction; an empty name or memo is left out.
+    narration = ' / '.join(filter(None, [op_name, op_memo]))
+    units = amount.Amount(op_amount, 'EUR')
+    posting = data.Posting(account, units, None, None, None, None)
+    metadata = data.new_metadata(filename, i)
+    return data.Transaction(metadata, op_date, flag, None, narration,
+                            data.EMPTY_SET, data.EMPTY_SET, [posting])
diff --git a/importers/cragr/cragr_ofx.py b/importers/cragr/cragr_ofx.py
new file mode 100644
index 0000000..25abb30
--- /dev/null
+++ b/importers/cragr/cragr_ofx.py
@@ -0,0 +1,48 @@
+"""Importer for Crédit Agricole OFX files.
+
+Based on beancount's OFX importer, but cleans up the narration field."""
+#
+# Copyright (C) 2019 Nicolas Schodet
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+from beancount.ingest.importers import ofx
+from beancount.core import data
+import re
+
+
+class Importer(ofx.Importer):
+    """OFX importer which tidies Crédit Agricole narrations."""
+
+    def extract(self, file, existing_entries=None):
+        """Extract entries, rewriting 'name / memo / OTHER' narrations.
+
+        The trailing OTHER is dropped, and so is a memo which is only '.'.
+        """
+        cleaned = []
+        for entry in super().extract(file):
+            found = (isinstance(entry, data.Transaction) and
+                     re.fullmatch(r'(.*) / (.*) / OTHER', entry.narration))
+            if found:
+                name, memo = found.groups()
+                memo = None if memo == '.' else ' '.join(memo.split())
+                parts = [part for part in (name, memo) if part]
+                entry = entry._replace(narration=' / '.join(parts))
+            cleaned.append(entry)
+        return cleaned
-- 
cgit v1.2.3