blob: 3fe5adf92a40fa20a7e1865c65857c6eabfc916c [file] [log] [blame]
"""
| Copyright (C) 2014-2017 Daniel Thiele
| TU Braunschweig, Germany
| All rights reserved.
| See LICENSE file for copyright and license details.
:Authors:
- Daniel Thiele (thiele@ida.ing.tu-bs.de)
Description
-----------
Parser XLS(X) file to dictionary
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import re
import xlrd
def xls_letter_to_col(self, letters):
""" Excel uses upper-case letters as column identifiers. This function
converts them to integers, i.e. A -> 0, ..., AA -> 26, etc.
"""
assert re.match(re.compile(r"^[A-Z]*$"), letters), \
'Invalid column ID: %s' % str(letters)
sum = 0
i = 1
for c in letters[::-1]:
sum += i * (ord(c) + 1 - ord('A'))
i *= 26
assert sum > 0
return sum - 1
class XLS_parser(object):
""" Parse XLS workbook. Data is stored in a dict-list-dict hierarchy,
i.e. self.sheets is a dict of worksheets, which contain a list of
all data lines of that worksheet. Each of these lines is a dict
using the entries of the header line as keys. The header is
expected to be in the first line. All other entries are treated
as data.
"""
def __init__(self, filename):
self.filename = filename
self.workbook = None
self.header = dict() # dict of headers
self.sheets = dict() # dict of worksheets
def get_line_of_sheet(self, sheet_name, line, use_xls_line_numbers=False):
""" Get a certain line of a worksheet. Indexing, by default, starts
at 0. Set use_xls_line_numbers to True to use indexing of XLS file.
"""
if use_xls_line_numbers:
assert line >= 2
line -= 2
return self.sheets[sheet_name][line]
def get_line_entry_of_sheet(self, sheet_name, line, element_name, use_xls_line_numbers=False):
""" Get a certain element of the given line in a sheet.
"""
return self.get_line_of_sheet(sheet_name, line, \
use_xls_line_numbers)[element_name]
def parse(self):
""" Parse XLS(X) file.
"""
self.workbook = xlrd.open_workbook(self.filename)
# Iterate over all sheets
for sn in self.workbook.sheet_names():
self.sheets[sn] = self._parse_worksheet(sn)
return self.sheets
def _parse_worksheet(self, worksheet_name):
""" Parse a given worksheet.
"""
sheet = self.workbook.sheet_by_name(worksheet_name)
result = list()
self.header[worksheet_name] = sheet.row_values(0)
assert len(self.header[worksheet_name]) == len(set(self.header[worksheet_name])), \
"Worksheet header of \"%s\" appears to have duplicate entries." % worksheet_name
for l in range(1, sheet.nrows):
line_dict = dict()
row_values = sheet.row_values(l)
for i in range(0, len(row_values)):
line_dict[str(self.header[worksheet_name][i])] = row_values[i]
result.append(line_dict)
return result
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4