Source code for resqpy.olio.load_data

"""Functions to load data from various ASCII simulator file formats."""

# Nexus is a registered trademark of the Halliburton Company

import logging

log = logging.getLogger(__name__)

import os
import numpy as np

import resqpy.olio.ab_toolbox as abt
import resqpy.olio.box_utilities as box
import resqpy.olio.grid_functions as gf
import resqpy.olio.keyword_files as kf
import resqpy.olio.write_data as wd


def file_exists(file_name, must_be_more_recent_than_file = None):
    """Returns True if the file exists (and is more recent than other file, if given)."""

    if file_name is None or len(file_name) == 0:
        return False
    existe = os.path.exists(file_name)
    if not existe:
        return False
    if not must_be_more_recent_than_file or file_name == must_be_more_recent_than_file or \
       not os.path.exists(must_be_more_recent_than_file):
        return True
    return os.path.getmtime(file_name) > os.path.getmtime(must_be_more_recent_than_file)
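
# Illustrative sketch (not part of the original module): how file_exists() might be used to
# decide whether a cached binary copy of an ascii file is still up to date; the file names
# 'perm.dat' and 'perm.db' are hypothetical.

def _example_file_exists_usage():
    ascii_name = 'perm.dat'  # hypothetical ascii data file
    binary_name = 'perm.db'  # hypothetical binary cache of the same data
    if file_exists(binary_name, must_be_more_recent_than_file = ascii_name):
        return 'binary cache is usable'
    return 'reload from ascii (and refresh binary cache)'
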
######################################################################################################
# load_corp_array_from_file():
# function to load a Nexus CORP array of data from a named file
# if the grid extent is not known, the file must be free of comments
# NB: extent_kji here is extent of grid, rather than that of the corp array
# returns a pagoda style 7D array, resequenced
def load_corp_array_from_file(file_name,
                              extent_kji = None,
                              corp_bin = False,
                              swap_bytes = True,
                              max_lines_for_keyword = 100,
                              comment_char = None,
                              data_free_of_comments = False,
                              use_binary = False,
                              eight_mode = False,
                              use_numbers_only = None):
    """Loads a Nexus corner point (CORP) array from a file, returning a 7D numpy array in pagoda ordering.

    arguments:
       file_name: the name of an ascii file holding the CORP data (no other keywords with numeric data should
          be in the file); write access to the directory is likely to be needed if use_binary is True
       extent_kji: the extent of the grid as a list or a 3 element numpy array, in the order [NK, NJ, NI];
          if None, the extent is determined from the data; it must be given for 1D or 2D models
       corp_bin (boolean, default False): if True, the input file is in bespoke corp binary format, otherwise ascii
       swap_bytes (boolean, default True): if True, the byte ordering of corp bin data is reversed; only relevant
          if corp_bin is True
       max_lines_for_keyword: the maximum number of lines to search for the CORP keyword; set to zero if the file
          is known to contain data only
       comment_char: a single character string which is interpreted as introducing a comment
       data_free_of_comments: if True, once the numeric data is encountered, it is assumed that there are no
          further comments (allowing a faster load)
       use_binary: if True, a more recent file containing a pure binary copy of the data is looked for first,
          in the same directory; if found, the data is loaded directly from that file; if not found, the binary
          file is created after the ascii has been loaded (ready for next time)
       eight_mode: if True, the data is assumed to be in CORP EIGHT ordering, otherwise the normal ordering;
          this is not automatically determined from any keyword in the file
       use_numbers_only: no longer in use; ignored

    returns:
       a numpy array containing the CORP data in 7D pagoda protocol ordering; the extent of the grid, and hence
       the shape of the array, is determined from the corner point data unless extent_kji has been specified
    """

    # assert(extent_kji is not None or data_free_of_comments)  # no longer a requirement

    if extent_kji is None:
        extent = None
    else:
        extent = [extent_kji[0] * extent_kji[1] * extent_kji[2], 8, 3]

    if corp_bin:
        bin_file_size = os.path.getsize(file_name)
        bin_cell_count, remainder = divmod(bin_file_size, 26 * 4)  # corp bin format has records of 24 + 2 (head, tail) 32 bit words
        if remainder:
            log.error('corp binary file ' + str(file_name) + ' is not a whole number of records')
            return None
        dt = np.dtype('float32')
        if swap_bytes:
            dt = dt.newbyteorder()
        cp_array = np.fromfile(file_name, dtype = dt, count = 26 * bin_cell_count).reshape((-1, 26))[:, 1:25]
        if extent_kji is not None:
            if bin_cell_count != (extent_kji[0] * extent_kji[1] * extent_kji[2]):
                log.error('corp binary file ' + str(file_name) + ' contains data for ' + str(bin_cell_count) +
                          ' cells; extent requires ' + str(extent_kji[0] * extent_kji[1] * extent_kji[2]))
    else:
        cp_array = load_array_from_file(file_name,
                                        extent = extent,
                                        data_type = 'real',
                                        keyword = 'CORP',
                                        max_lines_for_keyword = max_lines_for_keyword,
                                        comment_char = comment_char,
                                        data_free_of_comments = data_free_of_comments,
                                        use_binary = use_binary)

    cell_count, remainder = divmod(cp_array.size, 24)
    if remainder:
        log.error('file ' + file_name + ' contains ' + str(cp_array.size) + ' data, which is not a multiple of 24')
        return None

    cp_array = cp_array.reshape(1, 1, cell_count, 2, 2, 2, 3)  # pagoda 7D, temporarily with all cells on I axis

    log.debug('resequencing corner point data')
    gf.resequence_nexus_corp(cp_array, eight_mode = eight_mode)  # switches IP points where JP = 1 (unless eight_mode)

    if extent_kji is None:
        log.info('determining grid extent from corner points')
        extent_kji = gf.determine_corp_extent(cp_array)
        if extent_kji is None:
            log.error('failed to determine extent of grid from corner points')
            return cp_array

    return cp_array.reshape(extent_kji[0], extent_kji[1], extent_kji[2], 2, 2, 2, 3)
######################################################################################################
# load_array_from_file():
# function to load an array of data from a named file
# if the extent is not known (None), the file must be free of comments
# a new numpy array is returned
# binary data file creation & reuse is supported
def load_array_from_file(file_name,
                         extent = None,
                         data_type = 'real',
                         keyword = None,
                         max_lines_for_keyword = None,
                         comment_char = None,
                         data_free_of_comments = False,
                         use_binary = False,
                         use_numbers_only = None):
    """Load an array from an ascii (or pure binary) file.

    Arguments are similar to those for load_corp_array_from_file().
    """

    if not use_binary:
        return load_array_from_ascii_file(file_name,
                                          extent = extent,
                                          data_type = data_type,
                                          keyword = keyword,
                                          max_lines_for_keyword = max_lines_for_keyword,
                                          comment_char = comment_char,
                                          data_free_of_comments = data_free_of_comments)

    (extension, ab_type) = abt.binary_file_extension_and_np_type_for_data_type(data_type)

    if extent is None:
        cell_count = -1  # np.fromfile interprets this as 'read everything'
        log.debug('Loading unknown number of array data elements from file ' + file_name)
    else:
        cell_count = np.prod(extent)
        log.debug('Loading %1d array data elements from file %s', cell_count, file_name)

    ascii_file_name = file_name
    binary_file_name = file_name
    if len(binary_file_name) < 4 or binary_file_name[-3:] != extension:
        binary_file_name += extension
    else:
        ascii_file_name = ascii_file_name[:-3]  # strip off '.db' or similar

    try:
        # tentatively try to read data from an existing binary file, if present
        if file_exists(binary_file_name, must_be_more_recent_than_file = ascii_file_name):
            with open(binary_file_name, 'rb') as binary_file_in:
                result = np.fromfile(binary_file_in, dtype = ab_type, count = cell_count)
                if extent is not None:
                    result = result.reshape(extent)
                # check that end of file has been reached, ie. not too much data in file
                try:  # expected to return null
                    c = binary_file_in.read(1)
                    if len(c):
                        log.warning('binary file contains more data than expected: ' + binary_file_name)
                except Exception:
                    pass
                log.info('Data loaded from binary file %s', binary_file_name)
                return result
    except Exception:
        pass

    # read from ascii file
    result = load_array_from_ascii_file(file_name,
                                        extent = extent,
                                        data_type = data_type,
                                        keyword = keyword,
                                        max_lines_for_keyword = max_lines_for_keyword,
                                        comment_char = comment_char,
                                        data_free_of_comments = data_free_of_comments)

    # create a binary file (to be used next time)
    try:
        wd.write_pure_binary_data(binary_file_name, result)
    except Exception:
        log.warning('Failed to write data to binary file %s', binary_file_name)
        # todo: could delete the binary file in case a corrupt file is left for use next time

    return result

# end of load_array_from_file() def
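
# Illustrative sketch (not part of the original module): using load_array_from_file() with
# use_binary = True, so that a pure binary copy of the data is created alongside the ascii
# file on the first load and reused on later loads; the file name 'Cell_depth.dat' and the
# extent are hypothetical (the binary sibling would get an extension such as '.db', as
# suggested by the code above).

def _example_load_with_binary_cache():
    extent = [148, 270, 103]  # hypothetical grid extent [NK, NJ, NI]
    # first call reads the ascii file and writes a pure binary copy for next time
    depth = load_array_from_file('Cell_depth.dat', extent = extent, data_type = 'real', use_binary = True)
    # a second, identical call would find the more recent binary file and load from it directly
    return depth
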
######################################################################################################

######################################################################################################
# load_array_from_ascii_file():
# function to load an array of data from a named ascii file
# if the extent is not known (None), the file must be free of comments
# a new numpy array is returned
def load_array_from_ascii_file(file_name,
                               extent = None,
                               data_type = 'real',
                               keyword = None,
                               max_lines_for_keyword = None,
                               comment_char = None,
                               data_free_of_comments = False,
                               skip_c_space = True,
                               use_numbers_only = None):
    """Returns a numpy array with data loaded from an ascii file.

    arguments:
       file_name: string holding name of the existing ascii data file, eg. 'Cell_depth.dat'
       extent: a python list of integers specifying the extent (shape) of the array, eg. [148, 270, 103];
          if None, all data is read and returned as a 'flat' 1D array (data must be free from comments)
       data_type: the type of individual data elements, one of 'real', 'float', 'int', 'integer', 'bool' or 'boolean'
       keyword: if present, an attempt is made to find the keyword before reading data; if keyword is None
          or is not found, data is read from the start of the file
       max_lines_for_keyword: can be used to limit the search for keyword (for speed efficiency)
       comment_char: a single character string being the character used to introduce a comment in the file
       data_free_of_comments: if set to True, a faster load is used once any header line comments have been skipped
       skip_c_space: if True, a line starting 'C' followed by white space is skipped as a comment
       use_numbers_only: this argument is no longer in use and is ignored

    returns:
       a numpy array of the shape specified in the extent argument, with dtype matching data_type

    example call:
       depth_array = load_data.load_array_from_ascii_file('Cell_depth.dat', [148, 270, 103])

    notes:
       this function is designed to load a single array of data from an ascii file that does not contain
       other arrays, ie. data for a single simulation keyword in the file;
       if skip_c_space is True, lines starting 'C ' are also treated as comments;
       if data_free_of_comments is True, there must be at least one blank line before the data begins,
       and no further comments are permitted (this format is designed to handle data files generated by
       a commonly used geomodelling package);
       repeat counts must not be present in the ascii data;
       the extent, if present, can contain any number of dimensions, typically 3 for reservoir modelling work;
       the total number of numbers in the file must match the number of elements in the given extent
       (ie. the product of the list of numbers in the extent argument);
       the order of indices in extent should be 'slowest changing' first, eg. k, j, i;
       'real' and 'float' are synonymous, as are 'int' and 'integer', and 'bool' and 'boolean';
       the data_type defaults to 'real';
       the numpy data type will be the default 64 bit float or 64 bit int
    """

    # todo: code enhancement could cater for 32 bit options (and 8 bit for bool) if needed to reduce memory usage

    if extent is None:
        cell_count = -1  # np.fromfile interprets this as 'read everything'
        log.debug('Loading unknown number of array data elements from ascii file ' + file_name)
    else:
        cell_count = np.prod(extent)
        log.debug('Loading %1d array data elements from ascii file %s', cell_count, file_name)

    if data_type in ['real', 'float', float]:
        d_type = 'float'
    elif data_type in ['int', 'integer', int]:
        d_type = 'int'
    elif data_type in ['bool', 'boolean', bool]:
        d_type = 'int'  # read booleans as 0 or 1 and convert after
    else:
        assert False, 'Unknown data_type passed to load_array_from_ascii_file: ' + str(data_type)

    read_file_name = file_name

    with open(read_file_name, 'r') as data_file:

        if not comment_char and not data_free_of_comments:
            comment_char = kf.guess_comment_char(data_file)
            if not comment_char:
                comment_char = '!'

        if keyword:
            keyword_found = kf.find_keyword(data_file, keyword, max_lines = max_lines_for_keyword)
            if keyword_found:
                data_file.readline()  # skip keyword line

        kf.skip_blank_lines_and_comments(data_file, comment_char = comment_char, skip_c_space = skip_c_space)

        result = None

        if data_free_of_comments:
            # use numpy fromfile function after passing header comments
            result = np.fromfile(data_file, dtype = d_type, count = cell_count, sep = ' ')
            if result is None:
                data_file.seek(0)

        if result is None:
            # builds one very big string, stripping trailing comments
            #   s = ''
            #   while True:
            #      r = data_file.readline()
            #      if len(r) == 0: break
            #      s += r.partition(comment_char)[0] + ' '
            #   result = np.fromstring(s, dtype = d_type, count = cell_count, sep = ' ')
            #   # note: extra data will go unnoticed if cell count known!
            #   del s
            b = bytearray(data_file.read().encode())
            bc = comment_char.encode()
            nl = b'\n'
            sp = b' '
            i = 0
            while True:
                i = b.find(bc, i)
                if i < 0:
                    break
                eol = b.find(nl, i)
                if eol < 0:
                    eol = len(b)
                b[i:eol] = sp * (eol - i)
            result = np.fromstring(b.decode(), dtype = d_type, count = cell_count, sep = ' ')
            # note: extra data will go unnoticed if cell count known!
            del b

        if result is None:
            assert (extent is not None)  # TODO: remove this restriction by dynamically extending a flat array
            view_1D = np.zeros(extent, dtype = d_type).flatten()
            elements = view_1D.size
            start_of_line = True
            for index in range(elements):
                while True:
                    ch = data_file.read(1)
                    assert (ch != '')  # premature end of file
                    if ch == comment_char:
                        data_file.readline()
                        start_of_line = True
                        continue
                    if skip_c_space and start_of_line and ch in ['C', 'c']:
                        next_ch = data_file.read(1)
                        if next_ch in ' \t\n':
                            if next_ch != '\n':
                                data_file.readline()
                            continue
                        ch += next_ch
                        break
                    if ch not in ' \t\n':
                        break
                start_of_line = False
                word = ch
                while True:
                    ch = data_file.read(1)
                    if ch == '':  # end of file (this assumes a whitespace character after last datum)
                        log.error('Not enough data in file %s: %1d of %1d numbers read', file_name, index, elements)
                        assert False, 'not enough data in file'
                    if ch in ' \t\n':
                        break
                    word += ch
                if ch == '\n':
                    start_of_line = True
                if d_type == 'float':
                    view_1D[index] = float(word)
                else:
                    view_1D[index] = int(word)
            result = view_1D

        kf.skip_blank_lines_and_comments(data_file, comment_char = comment_char, skip_c_space = skip_c_space)
        if not kf.end_of_file(data_file):
            assert False, 'too much data in ascii file: ' + file_name

    if result is not None and extent is not None:
        result = result.reshape(extent)

    if data_type in ['bool', 'boolean', bool]:
        return result != 0  # convert to boolean
    else:
        return result
# end of load_array_from_ascii_file() def
######################################################################################################
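
# Illustrative sketch (not part of the original module): loading a boolean (0/1) array from a
# comment-bearing ascii file; the file name 'net_flag.dat', the '!' comment character and the
# extent used here are hypothetical.

def _example_load_bool_array():
    flags = load_array_from_ascii_file('net_flag.dat',
                                       extent = [10, 20, 30],
                                       data_type = 'bool',
                                       comment_char = '!')
    # values are returned as a boolean numpy array (non-zero becomes True)
    return np.count_nonzero(flags)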