Source code for mldas.explore.loading

__copyright__ = """
Machine Learning for Distributed Acoustic Sensing data (MLDAS)
Copyright (c) 2020, The Regents of the University of California,
through Lawrence Berkeley National Laboratory (subject to receipt of
any required approvals from the U.S. Dept. of Energy). All rights reserved.

If you have questions about your rights to use or distribute this software,
please contact Berkeley Lab's Intellectual Property Office at
IPO@lbl.gov.

NOTICE.  This Software was developed under funding from the U.S. Department
of Energy and the U.S. Government consequently retains certain rights.  As
such, the U.S. Government has been granted for itself and others acting on
its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
Software to reproduce, distribute copies to the public, prepare derivative 
works, and perform publicly and display publicly, and to permit others to do so.
"""
__license__ = "Modified BSD license (see LICENSE.txt)"
__maintainer__ = "Vincent Dumont"
__email__ = "vincentdumont11@gmail.com"

# Systems
import os
import glob
import random

# Externals
import h5py
import numpy
import torch
from collections import OrderedDict
from torchvision import transforms
from PIL import Image

[docs]def hdf5read(fname,key=['dsi30','data','variable']):
    """
    Read input raw DAS file. The file is supposed to have an HDF5 format and the
    actual data stored in a group that have one of the following names: ``dsi30``,
    ``data`` or ``variable``.

    Parameters
    ----------
    fname : :py:class:`str`
      Full path to the HDF5 file
    key : :py:class:`str` or :py:class:`list`
      Group's key name where the data are stored. If not specify, a list of keys
      will be looped over.

    Returns
    -------
    data : :py:class:`numpy.ndarray`
      Raw data
    """
    f = h5py.File(fname,'r')
    if type(key)==str:
        data = f[f[key][0,0]]
    else:
        assert any(keyname in f.keys() for keyname in key), 'No acceptable key found in input HDF5 file.'
        for keyname in f.keys():
            if keyname in key:
                data = f[f['%s/dat'%keyname][0,0]]
                break
    return data

[docs]def load_model(fname,model):
    """
    Load saved model's parameter dictionary to initialized model.
    The function will remove any ``.module`` string from parameter's name.

    Parameters
    ----------
    fname : :py:class:`str`
      Path to saved model
    model : :py:class:`torch.nn.Module`
      Initialized network network architecture

    Returns
    -------
    model : :py:class:`torch.nn.Module`
      Up-to-date neural network model
    """
    checkpoint = torch.load(fname,map_location=lambda storage, loc: storage)
    state_dict = checkpoint['model']
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('module.'):
            k = k[7:]
        new_state_dict[k] = v
    checkpoint['model'] = new_state_dict
    model.load_state_dict(checkpoint['model'])
    return model

[docs]def save_data(data, dir_dst, fname):
    """
    Save raw data region as JPG image.

    Parameters
    ----------
    data : :py:class:`numpy.ndarray`
      Input raw data
    dir_dst : :py:class:`str`
      Path to save image
    fname : :py:class:`str`
      Output file name
    """
    img = (data-data.min())/(data.max()-data.min())
    img = Image.fromarray(numpy.uint8(img*255))
    img.save('%s/%s.jpg'%(dir_dst,fname))

[docs]def load_image(data,rgb=False,to_numpy=False,squeeze=False):
    """
    Load input data as tensor image. Input data can be either in the form
    of JPG image or raw data region and will be converted into numpy.uint8
    format, then either RGB or grayscale image.

    Parameters
    ----------
    data : :py:class:`str` or :py:class:`numpy.ndarray`
      Input data, either path saved image, or raw data array.
    rgb : :py:class:`bool`
      Convert data to RGB data image
    to_numpy : :py:class:`bool`
      Convert data to numpy array
    squeeze : :py:class:`bool`
      Squeeze data to remove any dimensions equal to 1.
    """
    if type(data)==str:
        data = os.path.expandvars(data)
        image = Image.open(data)
    else:
        image = (data-data.min())/(data.max()-data.min())
        image = Image.fromarray(numpy.uint8(image*255))
    if rgb:
        image = transforms.ToTensor()(image.convert("RGB")).view(1,3,*image.size)
    else:
        image = transforms.ToTensor()(image.convert("L")).view(1,1,*image.size)
    if squeeze:
        image = torch.squeeze(image)
    if to_numpy:
        image = image.numpy()
    return image

[docs]def load_bulk(dname,size,rgb=False,to_numpy=False,labeled=True):
    """
    Load multiple images from directory.

    Parameters
    ----------
    dname : :py:class:`str`
      Path to directory where images are saved.
    size : :py:class:`int`
      Number of images to be loaded
    rgb : :py:class:`bool`
      Convert data to RGB data image
    to_numpy : :py:class:`bool`
      Convert data to numpy array
    labeled : :py:class:`bool`
      Whether images are saved in a labeled structure (same structure than for 
      :py:class:`torchvision.datasets.ImageFolder` class) or directly in target repository.

    Returns
    -------
    tensors : :py:class:`torch.Tensor` or :py:class:`numpy.ndarray`
      Output list of loaded images either in tensor or numpy array formats.
    labels : :py:class:`torch.Tensor` or :py:class:`numpy.ndarray`
      Output list of labels either in tensor or numpy array formats.
    """
    fname = os.path.expandvars(dname)
    if labeled:
        all_labels = [label for label in os.listdir(fname+'/train/') if label!='.DS_Store']
        assert size%len(all_labels)==0, 'Requested bulk size not multiple of number of labels.'
        labels, tensors = [], []
        for i,label in enumerate(all_labels):
            file_list = glob.glob('%s/*/%s/*.jpg'%(dname,label))
            for fname in random.sample(file_list,size//len(all_labels)):
                tensors.append(load_image(fname,rgb=rgb))
            labels += ([i]*(size//len(all_labels)))
        tensors = torch.stack(tensors).squeeze(dim=1)
        if to_numpy:
            tensors = torch.squeeze(tensors).numpy()
        idxs = random.sample(range(size), size)
        return tensors[idxs], numpy.array(labels)[idxs]
    else:
        tensors = []
        file_list = glob.glob('%s/*.jpg'%dname)
        for fname in random.sample(file_list,size):
            tensors.append(load_image(fname,rgb=rgb))
        tensors = torch.stack(tensors).squeeze(dim=1)
        if to_numpy:
            tensors = torch.squeeze(tensors).numpy()
        return tensors,numpy.array([0]*(size))