flectra/addons/base_import/models/base_import.py
# -*- coding: utf-8 -*-
# Part of Odoo, Flectra. See LICENSE file for full copyright and licensing details.
import datetime
import io
import itertools
import logging
import psycopg2
import operator
import os
import re
from flectra import api, fields, models
from flectra.tools.translate import _
from flectra.tools.mimetypes import guess_mimetype
from flectra.tools.misc import ustr
from flectra.tools import DEFAULT_SERVER_DATE_FORMAT, DEFAULT_SERVER_DATETIME_FORMAT, pycompat
FIELDS_RECURSION_LIMIT = 2
ERROR_PREVIEW_BYTES = 200
_logger = logging.getLogger(__name__)
try:
import xlrd
try:
from xlrd import xlsx
except ImportError:
xlsx = None
except ImportError:
xlrd = xlsx = None
try:
from . import odf_ods_reader
except ImportError:
odf_ods_reader = None
FILE_TYPE_DICT = {
'text/csv': ('csv', True, None),
'application/vnd.ms-excel': ('xls', xlrd, 'xlrd'),
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ('xlsx', xlsx, 'xlrd >= 1.0.0'),
'application/vnd.oasis.opendocument.spreadsheet': ('ods', odf_ods_reader, 'odfpy')
}
EXTENSIONS = {
'.' + ext: handler
for mime, (ext, handler, req) in FILE_TYPE_DICT.items()
}
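# Illustrative sketch (not executed): with xlrd installed and odfpy missing,
# the derived EXTENSIONS mapping would look roughly like
#   {'.csv': True, '.xls': <module 'xlrd'>, '.xlsx': <module 'xlrd.xlsx'>, '.ods': None}
# i.e. a falsy handler means the required Python module (the ``req`` entry of
# FILE_TYPE_DICT) is not available for that format.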
class Import(models.TransientModel):
_name = 'base_import.import'
# allow imports to survive for 12h in case user is slow
_transient_max_hours = 12.0
res_model = fields.Char('Model')
file = fields.Binary('File', help="File to check and/or import, raw binary (not base64)")
file_name = fields.Char('File Name')
file_type = fields.Char('File Type')
@api.model
def get_fields(self, model, depth=FIELDS_RECURSION_LIMIT):
""" Recursively get fields for the provided model (through
fields_get) and filter them according to importability
The output format is a list of ``Field``, with ``Field``
defined as:
.. class:: Field
.. attribute:: id (str)
A non-unique identifier for the field, used to compute
the span of the ``required`` attribute: if multiple
``required`` fields have the same id, only one of them
is necessary.
.. attribute:: name (str)
The field's logical (Flectra) name within the scope of
its parent.
.. attribute:: string (str)
The field's human-readable name (``@string``)
.. attribute:: required (bool)
Whether the field is marked as required in the
model. Clients must provide non-empty import values
for all required fields or the import will error out.
.. attribute:: fields (list(Field))
The current field's subfields. The database and
external identifiers for m2o and m2m fields; a
filtered and transformed fields_get for o2m fields (to
a variable depth defined by ``depth``).
Fields with no sub-fields will have an empty list of
sub-fields.
:param str model: name of the model to get fields from
:param int depth: depth of recursion into o2m fields
"""
Model = self.env[model]
importable_fields = [{
'id': 'id',
'name': 'id',
'string': _("External ID"),
'required': False,
'fields': [],
'type': 'id',
}]
model_fields = Model.fields_get()
blacklist = models.MAGIC_COLUMNS + [Model.CONCURRENCY_CHECK_FIELD]
for name, field in model_fields.items():
if name in blacklist:
continue
# any value other than False (even an empty string) marks the field as
# deprecated; @deprecated must be absent or False to mean not-deprecated
if field.get('deprecated', False) is not False:
continue
if field.get('readonly'):
states = field.get('states')
if not states:
continue
# states = {state: [(attr, value), (attr2, value2)], state2:...}
if not any(attr == 'readonly' and value is False
for attr, value in itertools.chain.from_iterable(states.values())):
continue
field_value = {
'id': name,
'name': name,
'string': field['string'],
# 'required' is not always present in fields_get() results, default to False
'required': bool(field.get('required')),
'fields': [],
'type': field['type'],
}
if field['type'] in ('many2many', 'many2one'):
field_value['fields'] = [
dict(field_value, name='id', string=_("External ID"), type='id'),
dict(field_value, name='.id', string=_("Database ID"), type='id'),
]
elif field['type'] == 'one2many' and depth:
field_value['fields'] = self.get_fields(field['relation'], depth=depth-1)
if self.user_has_groups('base.group_no_one'):
field_value['fields'].append({'id': '.id', 'name': '.id', 'string': _("Database ID"), 'required': False, 'fields': [], 'type': 'id'})
importable_fields.append(field_value)
# TODO: cache on model?
return importable_fields
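# Illustrative sketch (not executed), assuming a model with a required
# many2one field 'partner_id' labelled "Partner": get_fields() would include
# an entry roughly like
#   {'id': 'partner_id', 'name': 'partner_id', 'string': 'Partner',
#    'required': True, 'type': 'many2one',
#    'fields': [{..., 'name': 'id', 'string': 'External ID', 'type': 'id'},
#               {..., 'name': '.id', 'string': 'Database ID', 'type': 'id'}]}
# in addition to the 'External ID' pseudo-field that is always prepended.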
@api.multi
def _read_file(self, options):
""" Dispatch to specific method to read file content, according to its mimetype or file type
:param options : dict of reading options (quoting, separator, ...)
"""
self.ensure_one()
# guess mimetype from file content
mimetype = guess_mimetype(self.file)
(file_extension, handler, req) = FILE_TYPE_DICT.get(mimetype, (None, None, None))
if handler:
try:
return getattr(self, '_read_' + file_extension)(options)
except Exception:
_logger.warning("Failed to read file '%s' (transient id %d) using guessed mimetype %s", self.file_name or '<unknown>', self.id, mimetype)
# try reading with user-provided mimetype
(file_extension, handler, req) = FILE_TYPE_DICT.get(self.file_type, (None, None, None))
if handler:
try:
return getattr(self, '_read_' + file_extension)(options)
except Exception:
_logger.warning("Failed to read file '%s' (transient id %d) using user-provided mimetype %s", self.file_name or '<unknown>', self.id, self.file_type)
# fallback on file extensions as mime types can be unreliable (e.g.
# software setting incorrect mime types, or non-installed software
# leading to browser not sending mime types)
if self.file_name:
p, ext = os.path.splitext(self.file_name)
if ext in EXTENSIONS:
try:
return getattr(self, '_read_' + ext[1:])(options)
except Exception:
_logger.warning("Failed to read file '%s' (transient id %s) using file extension", self.file_name, self.id)
if req:
raise ImportError(_("Unable to load \"{extension}\" file: requires Python module \"{modname}\"").format(extension=file_extension, modname=req))
raise ValueError(_("Unsupported file format \"{}\", import only supports CSV, ODS, XLS and XLSX").format(self.file_type))
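# Illustrative sketch (not executed): for a record whose ``file`` content is
# detected as 'text/csv', the dispatch above resolves to self._read_csv(options);
# if both the guessed and the user-provided mimetypes fail, a '.xlsx' suffix in
# ``file_name`` still falls back to self._read_xlsx(options).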
@api.multi
def _read_xls(self, options):
""" Read file content, using xlrd lib """
book = xlrd.open_workbook(file_contents=self.file)
return self._read_xls_book(book)
def _read_xls_book(self, book):
sheet = book.sheet_by_index(0)
# emulate Sheet.get_rows for pre-0.9.4
for row in pycompat.imap(sheet.row, range(sheet.nrows)):
values = []
for cell in row:
if cell.ctype is xlrd.XL_CELL_NUMBER:
is_float = cell.value % 1 != 0.0
values.append(
pycompat.text_type(cell.value)
if is_float
else pycompat.text_type(int(cell.value))
)
elif cell.ctype is xlrd.XL_CELL_DATE:
is_datetime = cell.value % 1 != 0.0
# emulate xldate_as_datetime for pre-0.9.3
dt = datetime.datetime(*xlrd.xldate.xldate_as_tuple(cell.value, book.datemode))
values.append(
dt.strftime(DEFAULT_SERVER_DATETIME_FORMAT)
if is_datetime
else dt.strftime(DEFAULT_SERVER_DATE_FORMAT)
)
elif cell.ctype is xlrd.XL_CELL_BOOLEAN:
values.append(u'True' if cell.value else u'False')
elif cell.ctype is xlrd.XL_CELL_ERROR:
raise ValueError(
_("Error cell found while reading XLS/XLSX file: %s") %
xlrd.error_text_from_code.get(
cell.value, "unknown error code %s" % cell.value)
)
else:
values.append(cell.value)
if any(x for x in values if x.strip()):
yield values
# use the same method for xlsx and xls files
_read_xlsx = _read_xls
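# Illustrative sketch (not executed): _read_xls_book normalises every cell to
# text, e.g. a numeric cell 3.0 becomes u'3' while 3.5 becomes u'3.5', a date
# cell becomes '2018-07-13' (or '2018-07-13 09:30:51' when it carries a time
# part, assuming the default server formats), and booleans become u'True'/u'False'.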
@api.multi
def _read_ods(self, options):
""" Read file content using ODSReader custom lib """
doc = odf_ods_reader.ODSReader(file=io.BytesIO(self.file))
return (
row
for row in doc.getFirstSheet()
if any(x for x in row if x.strip())
)
@api.multi
def _read_csv(self, options):
""" Returns a CSV-parsed iterator of all non-empty lines in the file
:throws csv.Error: if an error is detected during CSV parsing
:throws UnicodeDecodeError: if ``options.encoding`` is incorrect
"""
csv_data = self.file
# TODO: guess encoding with chardet? Or https://github.com/aadsm/jschardet
encoding = options.get('encoding', 'utf-8')
if encoding != 'utf-8':
# the csv module expects utf-8, see http://docs.python.org/2/library/csv.html
csv_data = csv_data.decode(encoding).encode('utf-8')
csv_iterator = pycompat.csv_reader(
io.BytesIO(csv_data),
quotechar=str(options['quoting']),
delimiter=str(options['separator']))
return (
row for row in csv_iterator
if any(x for x in row if x.strip())
)
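# Illustrative sketch (not executed): a typical CSV ``options`` payload,
# assuming the standard client defaults, looks like
#   {'encoding': 'utf-8', 'separator': ',', 'quoting': '"', 'headers': True}
# and _read_csv then yields every non-empty row as a list of unicode strings.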
@api.model
def _try_match_column(self, preview_values, options):
""" Returns the potential field types, based on the preview values, using heuristics
:param preview_values : list of value for the column to determine
:param options : parsing options
"""
# If all preview values are empty, the column could match any field
if all([v == '' for v in preview_values]):
return ['all']
# If all values start with __export__, this is probably an id
if all(v.startswith('__export__') for v in preview_values):
return ['id', 'many2many', 'many2one', 'one2many']
# If all values can be cast to int, the type may be id, integer, char,
# float, monetary or a relational field
# Exception: if we only have 1 and 0, it can also be a boolean
try:
field_type = ['id', 'integer', 'char', 'float', 'monetary', 'many2one', 'many2many', 'one2many']
res = set(int(v) for v in preview_values if v)
if {0, 1}.issuperset(res):
field_type.append('boolean')
return field_type
except ValueError:
pass
# If all values are either True or False, type is boolean
if all(val.lower() in ('true', 'false', 't', 'f', '') for val in preview_values):
return ['boolean']
# If all values can be cast to float, type is either float or monetary
# Or a date/datetime if it matches the pattern
results = []
try:
thousand_separator = decimal_separator = False
for val in preview_values:
val = val.strip()
if not val:
continue
# the value might have a currency symbol to its left or right
val = self._remove_currency_symbol(val)
if val:
if options.get('float_thousand_separator') and options.get('float_decimal_separator'):
val = val.replace(options['float_thousand_separator'], '').replace(options['float_decimal_separator'], '.')
# We are now sure that this is a float, but we still need to find the
# thousand and decimal separator
else:
if val.count('.') > 1:
options['float_thousand_separator'] = '.'
options['float_decimal_separator'] = ','
elif val.count(',') > 1:
options['float_thousand_separator'] = ','
options['float_decimal_separator'] = '.'
elif val.find('.') > val.find(','):
thousand_separator = ','
decimal_separator = '.'
elif val.find(',') > val.find('.'):
thousand_separator = '.'
decimal_separator = ','
else:
# This is not a float: force a ValueError to exit the try block
float('a')
if thousand_separator and not options.get('float_decimal_separator'):
options['float_thousand_separator'] = thousand_separator
options['float_decimal_separator'] = decimal_separator
results = ['float', 'monetary']
except ValueError:
pass
# Try to see if all values are a date or datetime
dt = datetime.datetime
separator = [' ', '/', '-']
date_format = ['%mr%dr%Y', '%dr%mr%Y', '%Yr%mr%d', '%Yr%dr%m']
date_patterns = [options['date_format']] if options.get('date_format') else []
if not date_patterns:
date_patterns = [pattern.replace('r', sep) for sep in separator for pattern in date_format]
date_patterns.extend([p.replace('Y', 'y') for p in date_patterns])
datetime_patterns = [options['datetime_format']] if options.get('datetime_format') else []
if not datetime_patterns:
datetime_patterns = [pattern + ' %H:%M:%S' for pattern in date_patterns]
current_date_pattern = False
current_datetime_pattern = False
def check_patterns(patterns, preview_values):
for pattern in patterns:
match = True
for val in preview_values:
if not val:
continue
try:
dt.strptime(val, pattern)
except ValueError:
match = False
break
if match:
return pattern
return False
current_date_pattern = check_patterns(date_patterns, preview_values)
if current_date_pattern:
options['date_format'] = current_date_pattern
results += ['date']
current_datetime_pattern = check_patterns(datetime_patterns, preview_values)
if current_datetime_pattern:
options['datetime_format'] = current_datetime_pattern
results += ['datetime']
if results:
return results
return ['id', 'text', 'char', 'datetime', 'selection', 'many2one', 'one2many', 'many2many', 'html']
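# Illustrative examples (not executed) of the heuristics above:
#   ['1', '0', '2']       -> ['id', 'integer', 'char', 'float', 'monetary',
#                             'many2one', 'many2many', 'one2many']
#                             (no 'boolean' because of the value 2)
#   ['true', 'false', ''] -> ['boolean']
#   ['1,234.56', '2.00']  -> ['float', 'monetary'], with the detected
#                             separators written back into ``options``
#   ['2018-07-13', '']    -> 'date' is appended to the result and
#                             options['date_format'] is set to '%Y-%m-%d'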
@api.model
def _find_type_from_preview(self, options, preview):
type_fields = []
if preview:
for column in range(0, len(preview[0])):
preview_values = [value[column].strip() for value in preview]
type_field = self._try_match_column(preview_values, options)
type_fields.append(type_field)
return type_fields
def _match_header(self, header, fields, options):
""" Attempts to match a given header to a field of the
imported model.
:param str header: header name from the CSV file
:param fields:
:param dict options:
:returns: an empty list if the header couldn't be matched, or
all the fields to traverse
:rtype: list(Field)
"""
string_match = None
for field in fields:
# FIXME: should match all translations & original
# TODO: use string distance (levenshtein? hamming?)
if header.lower() == field['name'].lower():
return [field]
if header.lower() == field['string'].lower():
# matching on the string is not reliable because
# strings have no unique constraint
string_match = field
if string_match:
# this behavior is only applied if there is no matching field['name']
return [string_match]
if '/' not in header:
return []
# relational field path
traversal = []
subfields = fields
# Iteratively dive into fields tree
for section in header.split('/'):
# Strip section in case spaces are added around '/' for
# readability of paths
match = self._match_header(section.strip(), subfields, options)
# Any match failure, exit
if not match:
return []
# prep subfields for next iteration within match[0]
field = match[0]
subfields = field['fields']
traversal.append(field)
return traversal
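# Illustrative sketch (not executed), assuming the model has a many2one field
# 'partner_id' labelled "Partner": the header 'partner_id/id' is matched by
# traversing the path and yields [<partner_id field>, <id subfield>], the
# header 'Partner' matches on the label and yields [<partner_id field>], and
# an unknown header such as 'foo' yields [].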
def _match_headers(self, rows, fields, options):
""" Attempts to match the imported model's fields to the
titles of the parsed CSV file, if the file is supposed to have
headers.
Will consume the first line of the ``rows`` iterator.
Returns an empty list and an empty dict if headers were not requested,
otherwise the list of headers and a dict mapping cell indices
to key paths in the ``fields`` tree
:param Iterator rows:
:param dict fields:
:param dict options:
:rtype: (list, dict) | (list(str), dict(int: list(str)))
"""
if not options.get('headers'):
return [], {}
headers = next(rows)
return headers, {
index: [field['name'] for field in self._match_header(header, fields, options)] or None
for index, header in enumerate(headers)
}
@api.multi
def parse_preview(self, options, count=10):
""" Generates a preview of the uploaded files, and performs
fields-matching between the import's file data and the model's
columns.
If the headers are not requested (not options.headers),
``matches`` and ``headers`` are both ``False``.
:param int count: number of preview lines to generate
:param options: format-specific options.
CSV: {encoding, quoting, separator, headers}
:type options: {str, str, str, bool}
:returns: {fields, matches, headers, preview} | {error, preview}
:rtype: {dict(str: dict(...)), dict(int, list(str)), list(str), list(list(str))} | {str, str}
"""
self.ensure_one()
fields = self.get_fields(self.res_model)
try:
rows = self._read_file(options)
headers, matches = self._match_headers(rows, fields, options)
# Matching should have consumed the first row (iff headers were
# requested); take the ``count`` next rows for the preview
preview = list(itertools.islice(rows, count))
assert preview, "CSV file seems to have no content"
header_types = self._find_type_from_preview(options, preview)
if options.get('keep_matches', False) and len(options.get('fields', [])):
matches = {}
for index, match in enumerate(options.get('fields')):
if match:
matches[index] = match.split('/')
return {
'fields': fields,
'matches': matches or False,
'headers': headers or False,
'headers_type': header_types or False,
'preview': preview,
'options': options,
'debug': self.user_has_groups('base.group_no_one'),
}
except Exception as error:
# Due to lazy generators, UnicodeDecodeError (for
# instance) may only be raised when serializing the
# preview to a list in the return.
_logger.debug("Error during parsing preview", exc_info=True)
preview = None
if self.file_type == 'text/csv':
preview = self.file[:ERROR_PREVIEW_BYTES].decode('iso-8859-1')
return {
'error': str(error),
# iso-8859-1 ensures decoding will always succeed,
# even if it yields non-printable characters. This is
# in case of UnicodeDecodeError (or csv.Error
# compounded with UnicodeDecodeError)
'preview': preview,
}
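# Illustrative sketch (not executed): for a CSV with headers 'name,email,foo'
# imported into a model that has 'name' and 'email' fields, parse_preview
# would return something roughly like
#   {'fields': [...], 'headers': ['name', 'email', 'foo'],
#    'matches': {0: ['name'], 1: ['email'], 2: None},
#    'headers_type': [...], 'preview': [...up to 10 rows...],
#    'options': {...}, 'debug': False}
# or {'error': '...', 'preview': '...'} when the file cannot be parsed.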
@api.model
def _convert_import_data(self, fields, options):
""" Extracts the input BaseModel and fields list (with
``False``-y placeholders for fields to *not* import) into a
format Model.import_data can use: a fields list without holes
and the precisely matching data matrix
:param list(str|bool) fields:
:returns: (data, fields)
:rtype: (list(list(str)), list(str))
:raises ValueError: in case the import data could not be converted
"""
# Get indices for non-empty fields
indices = [index for index, field in enumerate(fields) if field]
if not indices:
raise ValueError(_("You must configure at least one field to import"))
# If only one index, itemgetter will return an atom rather
# than a 1-tuple
if len(indices) == 1:
mapper = lambda row: [row[indices[0]]]
else:
mapper = operator.itemgetter(*indices)
# Get only list of actually imported fields
import_fields = [f for f in fields if f]
rows_to_import = self._read_file(options)
if options.get('headers'):
rows_to_import = itertools.islice(rows_to_import, 1, None)
data = [
list(row) for row in pycompat.imap(mapper, rows_to_import)
# don't try inserting completely empty rows (e.g. from
# filtering out o2m fields)
if any(row)
]
return data, import_fields
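# Illustrative sketch (not executed): with fields = ['name', False, 'email'],
# the second column is dropped, import_fields becomes ['name', 'email'] and a
# file row ['Acme', 'ignored', 'acme@example.com'] is mapped to
# ['Acme', 'acme@example.com']; completely empty rows are skipped.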
@api.model
def _remove_currency_symbol(self, value):
value = value.strip()
negative = False
# Careful: some countries use () for negative amounts, so replace them with a leading - sign
if value.startswith('(') and value.endswith(')'):
value = value[1:-1]
negative = True
float_regex = re.compile(r'([-]?[0-9.,]+)')
split_value = [g for g in float_regex.split(value) if g]
if len(split_value) > 2:
# This is probably not a float
return False
if len(split_value) == 1:
if float_regex.search(split_value[0]) is not None:
return split_value[0] if not negative else '-' + split_value[0]
return False
else:
# String has been split in 2, locate which index contains the float and which does not
currency_index = 0
if float_regex.search(split_value[0]) is not None:
currency_index = 1
# Check that currency exists
currency = self.env['res.currency'].search([('symbol', '=', split_value[currency_index].strip())])
if len(currency):
return split_value[(currency_index + 1) % 2] if not negative else '-' + split_value[(currency_index + 1) % 2]
# Otherwise it is not a float with a currency symbol
return False
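# Illustrative examples (not executed) for _remove_currency_symbol, assuming
# '$' is the symbol of an existing res.currency record:
#   '1234.56'      -> '1234.56'
#   '(1234.56)'    -> '-1234.56'
#   '$ 1234.56'    -> '1234.56'
#   '1234.56 XYZ'  -> False   (assuming no currency uses the symbol 'XYZ')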
@api.model
def _parse_float_from_data(self, data, index, name, options):
thousand_separator = options.get('float_thousand_separator', ' ')
decimal_separator = options.get('float_decimal_separator', '.')
for line in data:
line[index] = line[index].strip()
if not line[index]:
continue
line[index] = line[index].replace(thousand_separator, '').replace(decimal_separator, '.')
old_value = line[index]
line[index] = self._remove_currency_symbol(line[index])
if line[index] is False:
raise ValueError(_("Column %s contains incorrect values (value: %s)") % (name, old_value))
@api.multi
def _parse_import_data(self, data, import_fields, options):
""" Launch the first call to _parse_import_data_recursive with an
empty prefix. _parse_import_data_recursive will be run
recursively for each relational field.
"""
return self._parse_import_data_recursive(self.res_model, '', data, import_fields, options)
@api.multi
def _parse_import_data_recursive(self, model, prefix, data, import_fields, options):
# Get fields of type date/datetime
all_fields = self.env[model].fields_get()
for name, field in all_fields.items():
name = prefix + name
if field['type'] in ('date', 'datetime') and name in import_fields:
# Parse date
index = import_fields.index(name)
dt = datetime.datetime
server_format = DEFAULT_SERVER_DATE_FORMAT if field['type'] == 'date' else DEFAULT_SERVER_DATETIME_FORMAT
if options.get('%s_format' % field['type'], server_format) != server_format:
# datetime.str[fp]time takes *native strings* in both
# versions, for both data and pattern
user_format = pycompat.to_native(options.get('%s_format' % field['type']))
for num, line in enumerate(data):
if line[index]:
line[index] = line[index].strip()
if line[index]:
try:
line[index] = dt.strftime(dt.strptime(pycompat.to_native(line[index]), user_format), server_format)
except ValueError as e:
raise ValueError(_("Column %s contains incorrect values. Error in line %d: %s") % (name, num + 1, e))
except Exception as e:
raise ValueError(_("Error Parsing Date [%s:L%d]: %s") % (name, num + 1, e))
# Check if the field is in import_fields and is relational (followed by '/')
# Also verify that the field name exactly matches the import_fields entry at the correct level.
elif any(name + '/' in import_field and name == import_field.split('/')[prefix.count('/')] for import_field in import_fields):
# Recursive call with the relational as new model and add the field name to the prefix
self._parse_import_data_recursive(field['relation'], name + '/', data, import_fields, options)
elif field['type'] in ('float', 'monetary') and name in import_fields:
# Parse float, sometimes float values from file have currency symbol or () to denote a negative value
# We should be able to manage both cases
index = import_fields.index(name)
self._parse_float_from_data(data, index, name, options)
return data
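# Illustrative sketch (not executed), assuming the model has a date field
# 'date_order' and options = {'date_format': '%d/%m/%Y'}: a cell '13/07/2018'
# in the 'date_order' column is rewritten to '2018-07-13' (the server date
# format) before being handed to Model.load(); float/monetary columns are
# stripped of thousand separators and currency symbols in the same pass.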
@api.multi
def do(self, fields, options, dryrun=False):
""" Actual execution of the import
:param fields: import mapping: maps each column to a field,
``False`` for the columns to ignore
:type fields: list(str|bool)
:param dict options:
:param bool dryrun: performs all import operations (and
validations) but rolls back the writes, allowing
as many errors as possible to be collected without
the risk of clobbering the database.
:returns: A list of errors. If the list is empty the import
executed fully and correctly. If the list is
non-empty it contains dicts with 3 keys ``type`` the
type of error (``error|warning``); ``message`` the
error message associated with the error (a string)
and ``record`` the data which failed to import (or
``false`` if that data isn't available or provided)
:rtype: list({type, message, record})
"""
self.ensure_one()
self._cr.execute('SAVEPOINT import')
try:
data, import_fields = self._convert_import_data(fields, options)
# Parse date and float field
data = self._parse_import_data(data, import_fields, options)
except ValueError as error:
return [{
'type': 'error',
'message': pycompat.text_type(error),
'record': False,
}]
_logger.info('importing %d rows...', len(data))
model = self.env[self.res_model].with_context(import_file=True)
defer_parent_store = self.env.context.get('defer_parent_store_computation', True)
if defer_parent_store and model._parent_store:
model = model.with_context(defer_parent_store_computation=True)
import_result = model.load(import_fields, data)
_logger.info('done')
# If transaction aborted, RELEASE SAVEPOINT is going to raise
# an InternalError (ROLLBACK should work, maybe). Ignore that.
# TODO: to handle multiple errors, create savepoint around
# write and release it in case of write error (after
# adding error to errors array) => can keep on trying to
# import stuff, and rollback at the end if there is any
# error in the results.
try:
if dryrun:
self._cr.execute('ROLLBACK TO SAVEPOINT import')
else:
self._cr.execute('RELEASE SAVEPOINT import')
except psycopg2.InternalError:
pass
return import_result['messages']
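# Illustrative usage sketch (not executed), assuming a wizard record bound to
# res.partner with a two-column CSV already uploaded:
#   messages = wizard.do(
#       ['name', 'email'],
#       {'headers': True, 'separator': ',', 'quoting': '"', 'encoding': 'utf-8'},
#       dryrun=True)
# An empty ``messages`` list means the rows would import cleanly; otherwise
# each entry is a dict such as {'type': 'error', 'message': '...', 'record': False}.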