You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

416 lines
12 KiB
Python

6 years ago
# Authors: Travis Oliphant, Matthew Brett
"""
Base classes for MATLAB file stream reading.
MATLAB is a registered trademark of the Mathworks inc.
"""
from __future__ import division, print_function, absolute_import
import sys
import operator
from scipy._lib.six import reduce
import numpy as np
if sys.version_info[0] >= 3:
byteord = int
else:
byteord = ord
from scipy.misc import doccer
from . import byteordercodes as boc
class MatReadError(Exception):
pass
class MatWriteError(Exception):
pass
class MatReadWarning(UserWarning):
pass
doc_dict = \
{'file_arg':
'''file_name : str
Name of the mat file (do not need .mat extension if
appendmat==True) Can also pass open file-like object.''',
'append_arg':
'''appendmat : bool, optional
True to append the .mat extension to the end of the given
filename, if not already present.''',
'load_args':
'''byte_order : str or None, optional
None by default, implying byte order guessed from mat
file. Otherwise can be one of ('native', '=', 'little', '<',
'BIG', '>').
mat_dtype : bool, optional
If True, return arrays in same dtype as would be loaded into
MATLAB (instead of the dtype with which they are saved).
squeeze_me : bool, optional
Whether to squeeze unit matrix dimensions or not.
chars_as_strings : bool, optional
Whether to convert char arrays to string arrays.
matlab_compatible : bool, optional
Returns matrices as would be loaded by MATLAB (implies
squeeze_me=False, chars_as_strings=False, mat_dtype=True,
struct_as_record=True).''',
'struct_arg':
'''struct_as_record : bool, optional
Whether to load MATLAB structs as numpy record arrays, or as
old-style numpy arrays with dtype=object. Setting this flag to
False replicates the behavior of scipy version 0.7.x (returning
numpy object arrays). The default setting is True, because it
allows easier round-trip load and save of MATLAB files.''',
'matstream_arg':
'''mat_stream : file-like
Object with file API, open for reading.''',
'long_fields':
'''long_field_names : bool, optional
* False - maximum field name length in a structure is 31 characters
which is the documented maximum length. This is the default.
* True - maximum field name length in a structure is 63 characters
which works for MATLAB 7.6''',
'do_compression':
'''do_compression : bool, optional
Whether to compress matrices on write. Default is False.''',
'oned_as':
'''oned_as : {'row', 'column'}, optional
If 'column', write 1-D numpy arrays as column vectors.
If 'row', write 1D numpy arrays as row vectors.''',
'unicode_strings':
'''unicode_strings : bool, optional
If True, write strings as Unicode, else MATLAB usual encoding.'''}
docfiller = doccer.filldoc(doc_dict)
'''
Note on architecture
======================
There are three sets of parameters relevant for reading files. The
first are *file read parameters* - containing options that are common
for reading the whole file, and therefore every variable within that
file. At the moment these are:
* mat_stream
* dtypes (derived from byte code)
* byte_order
* chars_as_strings
* squeeze_me
* struct_as_record (MATLAB 5 files)
* class_dtypes (derived from order code, MATLAB 5 files)
* codecs (MATLAB 5 files)
* uint16_codec (MATLAB 5 files)
Another set of parameters are those that apply only to the current
variable being read - the *header*:
* header related variables (different for v4 and v5 mat files)
* is_complex
* mclass
* var_stream
With the header, we need ``next_position`` to tell us where the next
variable in the stream is.
Then, for each element in a matrix, there can be *element read
parameters*. An element is, for example, one element in a MATLAB cell
array. At the moment these are:
* mat_dtype
The file-reading object contains the *file read parameters*. The
*header* is passed around as a data object, or may be read and discarded
in a single function. The *element read parameters* - the mat_dtype in
this instance, is passed into a general post-processing function - see
``mio_utils`` for details.
'''
def convert_dtypes(dtype_template, order_code):
''' Convert dtypes in mapping to given order
Parameters
----------
dtype_template : mapping
mapping with values returning numpy dtype from ``np.dtype(val)``
order_code : str
an order code suitable for using in ``dtype.newbyteorder()``
Returns
-------
dtypes : mapping
mapping where values have been replaced by
``np.dtype(val).newbyteorder(order_code)``
'''
dtypes = dtype_template.copy()
for k in dtypes:
dtypes[k] = np.dtype(dtypes[k]).newbyteorder(order_code)
return dtypes
def read_dtype(mat_stream, a_dtype):
"""
Generic get of byte stream data of known type
Parameters
----------
mat_stream : file_like object
MATLAB (tm) mat file stream
a_dtype : dtype
dtype of array to read. `a_dtype` is assumed to be correct
endianness.
Returns
-------
arr : ndarray
Array of dtype `a_dtype` read from stream.
"""
num_bytes = a_dtype.itemsize
arr = np.ndarray(shape=(),
dtype=a_dtype,
buffer=mat_stream.read(num_bytes),
order='F')
return arr
def get_matfile_version(fileobj):
"""
Return major, minor tuple depending on apparent mat file type
Where:
#. 0,x -> version 4 format mat files
#. 1,x -> version 5 format mat files
#. 2,x -> version 7.3 format mat files (HDF format)
Parameters
----------
fileobj : file_like
object implementing seek() and read()
Returns
-------
major_version : {0, 1, 2}
major MATLAB File format version
minor_version : int
minor MATLAB file format version
Raises
------
MatReadError
If the file is empty.
ValueError
The matfile version is unknown.
Notes
-----
Has the side effect of setting the file read pointer to 0
"""
# Mat4 files have a zero somewhere in first 4 bytes
fileobj.seek(0)
mopt_bytes = fileobj.read(4)
if len(mopt_bytes) == 0:
raise MatReadError("Mat file appears to be empty")
mopt_ints = np.ndarray(shape=(4,), dtype=np.uint8, buffer=mopt_bytes)
if 0 in mopt_ints:
fileobj.seek(0)
return (0,0)
# For 5 format or 7.3 format we need to read an integer in the
# header. Bytes 124 through 128 contain a version integer and an
# endian test string
fileobj.seek(124)
tst_str = fileobj.read(4)
fileobj.seek(0)
maj_ind = int(tst_str[2] == b'I'[0])
maj_val = byteord(tst_str[maj_ind])
min_val = byteord(tst_str[1-maj_ind])
ret = (maj_val, min_val)
if maj_val in (1, 2):
return ret
raise ValueError('Unknown mat file type, version %s, %s' % ret)
def matdims(arr, oned_as='column'):
"""
Determine equivalent MATLAB dimensions for given array
Parameters
----------
arr : ndarray
Input array
oned_as : {'column', 'row'}, optional
Whether 1-D arrays are returned as MATLAB row or column matrices.
Default is 'column'.
Returns
-------
dims : tuple
Shape tuple, in the form MATLAB expects it.
Notes
-----
We had to decide what shape a 1 dimensional array would be by
default. ``np.atleast_2d`` thinks it is a row vector. The
default for a vector in MATLAB (e.g. ``>> 1:12``) is a row vector.
Versions of scipy up to and including 0.11 resulted (accidentally)
in 1-D arrays being read as column vectors. For the moment, we
maintain the same tradition here.
Examples
--------
>>> matdims(np.array(1)) # numpy scalar
(1, 1)
>>> matdims(np.array([1])) # 1d array, 1 element
(1, 1)
>>> matdims(np.array([1,2])) # 1d array, 2 elements
(2, 1)
>>> matdims(np.array([[2],[3]])) # 2d array, column vector
(2, 1)
>>> matdims(np.array([[2,3]])) # 2d array, row vector
(1, 2)
>>> matdims(np.array([[[2,3]]])) # 3d array, rowish vector
(1, 1, 2)
>>> matdims(np.array([])) # empty 1d array
(0, 0)
>>> matdims(np.array([[]])) # empty 2d
(0, 0)
>>> matdims(np.array([[[]]])) # empty 3d
(0, 0, 0)
Optional argument flips 1-D shape behavior.
>>> matdims(np.array([1,2]), 'row') # 1d array, 2 elements
(1, 2)
The argument has to make sense though
>>> matdims(np.array([1,2]), 'bizarre')
Traceback (most recent call last):
...
ValueError: 1D option "bizarre" is strange
"""
shape = arr.shape
if shape == (): # scalar
return (1,1)
if reduce(operator.mul, shape) == 0: # zero elememts
return (0,) * np.max([arr.ndim, 2])
if len(shape) == 1: # 1D
if oned_as == 'column':
return shape + (1,)
elif oned_as == 'row':
return (1,) + shape
else:
raise ValueError('1D option "%s" is strange'
% oned_as)
return shape
class MatVarReader(object):
''' Abstract class defining required interface for var readers'''
def __init__(self, file_reader):
pass
def read_header(self):
''' Returns header '''
pass
def array_from_header(self, header):
''' Reads array given header '''
pass
class MatFileReader(object):
""" Base object for reading mat files
To make this class functional, you will need to override the
following methods:
matrix_getter_factory - gives object to fetch next matrix from stream
guess_byte_order - guesses file byte order from file
"""
@docfiller
def __init__(self, mat_stream,
byte_order=None,
mat_dtype=False,
squeeze_me=False,
chars_as_strings=True,
matlab_compatible=False,
struct_as_record=True,
verify_compressed_data_integrity=True
):
'''
Initializer for mat file reader
mat_stream : file-like
object with file API, open for reading
%(load_args)s
'''
# Initialize stream
self.mat_stream = mat_stream
self.dtypes = {}
if not byte_order:
byte_order = self.guess_byte_order()
else:
byte_order = boc.to_numpy_code(byte_order)
self.byte_order = byte_order
self.struct_as_record = struct_as_record
if matlab_compatible:
self.set_matlab_compatible()
else:
self.squeeze_me = squeeze_me
self.chars_as_strings = chars_as_strings
self.mat_dtype = mat_dtype
self.verify_compressed_data_integrity = verify_compressed_data_integrity
def set_matlab_compatible(self):
''' Sets options to return arrays as MATLAB loads them '''
self.mat_dtype = True
self.squeeze_me = False
self.chars_as_strings = False
def guess_byte_order(self):
''' As we do not know what file type we have, assume native '''
return boc.native_code
def end_of_stream(self):
b = self.mat_stream.read(1)
curpos = self.mat_stream.tell()
self.mat_stream.seek(curpos-1)
return len(b) == 0
def arr_dtype_number(arr, num):
''' Return dtype for given number of items per element'''
return np.dtype(arr.dtype.str[:2] + str(num))
def arr_to_chars(arr):
''' Convert string array to char array '''
dims = list(arr.shape)
if not dims:
dims = [1]
dims.append(int(arr.dtype.str[2:]))
arr = np.ndarray(shape=dims,
dtype=arr_dtype_number(arr, 1),
buffer=arr)
empties = [arr == '']
if not np.any(empties):
return arr
arr = arr.copy()
arr[tuple(empties)] = ' '
return arr