TNO Intern

Commit c5fc7577 authored by Florian Knappers's avatar Florian Knappers
Browse files

Merge branch 'jd-fixes' into 'main'

Jd fixes

See merge request !139
parents d06fe5c8 31e51132
Loading
Loading
Loading
Loading
Loading
+82 −83

File changed.

Preview size limit exceeded, changes collapsed.

+0 −72
Original line number Diff line number Diff line
import numpy as np
import xarray as xr

from pathlib import Path


def auto_chunk_dataset(
    dataset_to_chunk: xr.Dataset | xr.DataArray, target_chunk_size: int
@@ -42,73 +40,3 @@ def auto_chunk_dataset(
        current_chunk_size = np.prod(list(chunking.values()))

    return dataset_to_chunk.copy().chunk(chunking)

def assess_optimal_chunk_size(n_simulations: int = 1000, chunk_step_size: int = 100, plot_outfile : str | Path = None):
    """
    This runs the same set of doublet simulations using different chunk sizes and prints the results to the terminal to show find which chunk size is optimal.
    It runs each chunk size 3 times and takes the average of their times to find the time taken for that chunk size.
    The chunks which are tested go from 1 -> n_simulations with steps of chunk_step_size.

    Parameters
    ----------
    n_simulations : int
        The number of simulations which are run in parralel
    chunk_step_size : int
        The step size of chunks to test; going from 1 to n_simulations
    plot_outfile : str | Path
        If provided a plot is made showing the time taken per-chunk size and saved to this path

    """
    reservoir_properties = xr.Dataset(
        {
            "thickness": (["sample"], np.ones(n_simulations) * 200),
            "porosity": (["sample"], np.ones(n_simulations)),
            "ntg": (["sample"], np.ones(n_simulations)),
            "depth": (["sample"], np.ones(n_simulations) * 1000),
            "permeability": (["sample"], np.ones(n_simulations) * 500),
        },
        coords={"sample": np.arange(n_simulations)}
    )

    n_attempts = 3 # do the same operation n_attempts and take an average of their times.
    sample_chunks = np.arange(1, n_simulations + 2, chunk_step_size)

    # run in series
    time_attempt = []
    for attempt in range(n_attempts):
        start = timeit.default_timer()
        simulation_benchmark = calculate_doublet_performance(reservoir_properties)
        time_attempt.append(timeit.default_timer() - start)
    normal_time = np.mean(time_attempt)
    print(f"non-parralel simulation took {normal_time:.1f} seconds, {n_simulations / normal_time:.1f} samples per second")


    # run in parallel:
    mean_time = []
    std_time = []
    for sample_chunk in sample_chunks:
        time_attempt = []
        for attempt in range(n_attempts):
            start = timeit.default_timer()
            simulations_parallel = calculate_doublet_performance(reservoir_properties, chunk_size=sample_chunk, print_execution_duration=False)
            time_attempt.append(timeit.default_timer() - start)

            # additional check that the results are identical
            xr.testing.assert_allclose(simulation_benchmark, simulations_parallel)
            xr.testing.assert_equal(simulation_benchmark, simulations_parallel)

        mean_time.append(np.mean(time_attempt))
        std_time.append(np.std(time_attempt))
        print(f"parralel simulation, chunk size: {sample_chunk}, took {np.mean(time_attempt):.1f} seconds to run {n_simulations} simulations, {n_simulations / mean_time[-1]:.1f} samples per second")

    if plot_outfile is None:
        return

    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    ax.errorbar(sample_chunks, mean_time, yerr=std_time, fmt='o', capsize=5, label='parralel simulation')
    ax.axhline(normal_time, label="non-parralel simulation", color="tab:orange", linestyle="--")
    ax.set_xlabel("chunk size")
    ax.set_ylabel("time (s)")
    ax.set_title("Chunk size assessment")
    ax.legend()
    plt.savefig(plot_outfile)
 No newline at end of file
+6 −0
Original line number Diff line number Diff line
@@ -87,6 +87,9 @@ class ThermoGISDoubletResults(BaseModel):
    transmissivity: FloatOrArray
    permeability: FloatOrArray | None
    temperature: FloatOrArray
    ntg: FloatOrArray
    porosity: FloatOrArray
    depth: FloatOrArray

    def to_dataset(self) -> xr.Dataset:
        """Convert results to an :class:`xarray.Dataset`.
@@ -271,6 +274,9 @@ class ThermoGISDoublet:
            transmissivity=transmissivity,
            permeability=permeability,
            temperature=self.aquifer.temperature,
            porosity=self.aquifer.porosity,
            ntg=self.aquifer.ntg,
            depth=self.aquifer.depth,
        )

    def _resolve_stochastic_properties(