From 46c9e3af33dbb75b372552d77ca8750640f0dd13 Mon Sep 17 00:00:00 2001
From: Arjo Segers
Date: Wed, 30 Apr 2025 16:35:38 +0200
Subject: [PATCH 1/2] Fixes for colocated output.

---
 src/cso/cso_colocate.py | 154 ++++++++++++++++++++++------------------
 src/cso/cso_file.py     |  18 ++++-
 2 files changed, 100 insertions(+), 72 deletions(-)

diff --git a/src/cso/cso_colocate.py b/src/cso/cso_colocate.py
index b53a782..d76b687 100644
--- a/src/cso/cso_colocate.py
+++ b/src/cso/cso_colocate.py
@@ -7,6 +7,10 @@
 # 2023, Lewis Blake, Arjo Segers
 #   Formatted using "black".
 #
+# 2025-04, Arjo Segers
+# Added option to drop some columns of the location csv after reading;
+# this prevents these columns from being added to the co-location output.
+#
 ########################################################################

@@ -73,7 +77,7 @@ class CSO_CoLocate(utopya.UtopyaRc):
     surface locations. The value assigned to location and time is an average over
     all pixels within a specified distance and valid for the time.

-    Location should be specified in a csv file with columns at least longitude and latiude::
+    Locations should be specified in a csv file with at least columns that specify the longitude and latitude::

        name             ;longitude;latitude
        Peyrusse Vieille ;     0.18;    43.62
@@ -92,12 +96,19 @@ class CSO_CoLocate(utopya.UtopyaRc):
        .locations.sep        :  ;
        .locations.comment    :  #

-    Also specify the column names that used for the longitude and latitude::
+    Also specify the column names that are used for the longitude and latitude::

        ! column names:
        .locations.longitude    :  longitude
        .locations.latiutde     :  latiutde

+    The content of the csv file is included in the output file as 1D variables with dimension ``(location,)``.
+    These variables have the name of the column in the csv file, preceded by ``location_``.
+    Optionally specify that some columns should be skipped::
+
+        ! skip auxiliary columns:
+        .locations.skip_columns    :  flag value
+
     Time series are created within a time range::

        ! time range:
@@ -198,9 +209,9 @@ class CSO_CoLocate(utopya.UtopyaRc):
         utopya.UtopyaRc.__init__(self, rcfile, rcbase=rcbase, env=env)

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** Co-located CSO data")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** Co-located CSO data")
+        logging.info(f"{indent}")

         # time range:
         t1 = self.GetSetting("timerange.start", totype="datetime")
         t2 = self.GetSetting("timerange.end", totype="datetime")
         # info ...
         tfmt = "%Y-%m-%d %H:%M"
-        logging.info(indent + "timerange: [%s,%s]" % (t1.strftime(tfmt), t2.strftime(tfmt)))
+        logging.info(f"{indent}timerange: [{t1.strftime(tfmt)},{t2.strftime(tfmt)}]")

         # output time step:
         output_freq = self.GetSetting("output.freq")
         # info ...
-        logging.info(indent + "output freqency: %s" % output_freq)
+        logging.info(f"{indent}output frequency: {output_freq}")
         # output time intervals:
         if output_freq in ["D", "day", "daily"]:
             outtimes1 = pandas.date_range(t1, t2, freq="D")
@@ -221,20 +232,23 @@
         elif output_freq in ["M", "month", "monthly"]:
             outtimes1 = pandas.date_range(t1, t2, freq="MS")
             outtimes2 = outtimes1 + pandas.DateOffset(months=1, seconds=-1)
+        elif output_freq in ["Y", "year", "yearly"]:
+            outtimes1 = pandas.date_range(t1, t2, freq="YS")
+            outtimes2 = pandas.date_range(t1, t2, freq="Y") + pandas.DateOffset(days=1, seconds=-1)
         else:
-            logging.error('unsupported output time step "%s"' % step)
+            logging.error(f"unsupported output time step '{output_freq}'")
             raise Exception
         # endif

         # target file:
         outfile_template = self.GetSetting("output.file")
         # info ...
-        logging.info(indent + "target files: %s" % outfile_template)
+        logging.info(f"{indent}target files: {outfile_template}")

         # renew?
         renew = self.GetSetting("renew", totype="bool")
         # info ...
-        logging.info(indent + "renew output: %s" % renew)
+        logging.info(f"{indent}renew output: {renew}")

         # pack floats as shorts?
         packed = self.GetSetting("output.packed", "bool")
@@ -245,21 +259,21 @@
         gattrs = {}
         attrnames = self.GetSetting("output.attrs").split()
         for key in attrnames:
-            gattrs[key] = self.GetSetting("output.attr.%s" % (key))
+            gattrs[key] = self.GetSetting(f"output.attr.{key}")
         # endfor

         # station file:
         locations_file = self.GetSetting("locations.file")
         # info ...
-        logging.info(indent + "location file: %s" % locations_file)
+        logging.info(f"{indent}location file: {locations_file}")
         # check ..
         if not os.path.isfile(locations_file):
-            logging.error("file not found: %s" % locations_file)
+            logging.error(f"file not found: {locations_file}")
             raise Exception
         # endif
         # special characters:
         sep = self.GetSetting("locations.sep", default=";")
-        comment = self.GetSetting("locations.sep", comment="#")
+        comment = self.GetSetting("locations.comment", default="#")
         # column names:
         loc_lon = self.GetSetting("locations.longitude")
         loc_lat = self.GetSetting("locations.latitude")
@@ -270,6 +284,16 @@
         # count:
         nloc = len(locations)

+        # skip some columns from the file?
+        skip_columns = self.GetSetting("locations.skip_columns", default="").split()
+        # defined?
+        if len(skip_columns) > 0:
+            # info ...
+            logging.info(f"{indent}skip columns: {skip_columns}")
+            # remove columns:
+            locations = locations.drop(columns=skip_columns)
+        # endif
+
         # time resolution
         resolution = self.GetSetting("resolution")

@@ -284,7 +308,7 @@
         for stype in stypes:
             # tempate for input file:
             sources[stype] = {}
-            sources[stype]["input"] = self.GetSetting("source.%s" % stype)
+            sources[stype]["input"] = self.GetSetting(f"source.{stype}")
         # endfor

         # first source type, this should have the footprints:
@@ -296,7 +320,7 @@
         varsources = {}
         for vkey in vkeys:
             # source description:
-            varsources[vkey] = self.GetSetting("output.%s.source" % (vkey))
+            varsources[vkey] = self.GetSetting(f"output.{vkey}.source")
         # endfor

         # loop over times for output files:
@@ -309,13 +333,13 @@
             # create?
             if (not os.path.isfile(outfile)) or renew:
                 # info ...
-                logging.info(indent + "create %s .." % outfile)
+                logging.info(f"{indent}create {outfile} ..")

                 # averaging time intervals:
                 if resolution == "hourly":
                     # start and end times:
                     tt1 = pandas.date_range(outtime1, outtime2, freq="H")
-                    tt2 = t1 + pandas.Timedelta(hours=1)
+                    tt2 = tt1 + pandas.Timedelta(hours=1)
                 #
                 elif resolution == "daily":
                     # start and end times:
@@ -323,7 +347,7 @@
                     tt1 = pandas.date_range(outtime1, outtime2, freq="D")
                     tt2 = tt1 + pandas.Timedelta(days=1)
                 #
                 else:
-                    logging.error('unsupported resolution "%s"' % resolution)
+                    logging.error(f"unsupported resolution '{resolution}'")
                     raise Exception
                 # endif
@@ -332,7 +356,7 @@
                 # mid:
                 ttm = tt1 + (tt2 - tt1) / 2
                 # count:
                 nt = len(ttm)

                 # info ...
-                logging.info(indent + "init output file ...")
+                logging.info(f"{indent}init output file ...")
                 # init output file:
                 outf = cso_file.CSO_File()
                 # locations:
@@ -349,13 +373,13 @@
                 # store location info:
                 for key in locations.keys():
                     # variable name:
-                    vname = "location_%s" % key
+                    vname = f"location_{key}"
                     # add data:
                     outf.AddDataVariable(
                         vname,
                         ("location",),
                         values=locations[key],
-                        attrs={"long_name": "location %s" % key, "units": "1"},
+                        attrs={"long_name": f"location {key}", "units": "1"},
                     )
                 # endfor
                 # storage for number of selected pixels:
@@ -364,7 +388,7 @@
                     ("time", "location"),
                     values=numpy.zeros((nt, nloc), dtype="i2"),
                     attrs={
-                        "long_name": "number of pixels within %i km" % distance_km,
+                        "long_name": f"number of pixels within {distance_km} km",
                         "units": "1",
                     },
                 )

                 outf_setup = False

                 # init history:
-                line = "averages over pixels with %i km from locations specified in %s;" % (
-                    distance_km,
-                    locations_file,
-                )
+                line = f"averages over pixels within {distance_km} km from locations specified in {locations_file};"
                 line = line + "data files:"
                 for stype in stypes:
                     line = line + " " + sources[stype]["input"]
@@ -386,10 +407,8 @@
                 # assume input has hourly timestamps ..
                 times = pandas.date_range(outtime1, outtime2, freq="H")
                 # info ...
-                logging.info(
-                    indent + "search for input files in [%s,%s] .." % (outtime1, outtime2)
-                )
-                logging.info(indent + "  search times: %s, .., %s" % (times[0], times[-1]))
+                logging.info(f"{indent}search for input files in [{outtime1},{outtime2}] ..")
+                logging.info(f"{indent}  search times: {times[0]}, .., {times[-1]}")
                 # list of processed files:
                 donefiles = []
                 # loop over input times:
@@ -404,41 +423,41 @@
                         continue

                     # info ...
-                    logging.info(indent + "  read input file(s) ...")
+                    logging.info(f"{indent}  read input file(s) ...")
                     # loop:
                     for stype in stypes:
                         # input file:
                         infile = time.strftime(sources[stype]["input"])
                         # info ..
-                        logging.info(indent + "    %s" % infile)
+                        logging.info(f"{indent}    {infile}")
                         # read:
                         sources[stype]["file"] = cso_file.CSO_File(filename=infile)
                         ## update history:
-                        # history.insert( 0, 'pixel data from %s' % os.path.basename(infile) )
+                        # history.insert( 0, f"pixel data from {os.path.basename(infile)}")
                     # endfor

                     # filter:
                     selected, history = sources[stype0]["file"].SelectPixels(
-                        self.rcf, rcbase, indent=indent + "    "
+                        self.rcf, rcbase, indent=f"{indent}    "
                     )
                     # count:
                     npix = selected.size
                     nsel = selected.sum()
                     # none?
                     if nsel == 0:
-                        loggin.info(indent + "    no pixels selected ..")
+                        logging.info(f"{indent}    no pixels selected ..")
                         continue
                     # endif
                     # info ..
-                    logging.info(indent + "    selected %i / %i pixels .." % (nsel, npix))
+                    logging.info(f"{indent}    selected {nsel} / {npix} pixels ..")

                     # target record:
-                    (ii,) = numpy.where((time >= tt1) & (time < tt2))
+                    (ii,) = numpy.where((time > tt1) & (time <= tt2))
                     # check ..
                     if len(ii) != 1:
-                        logging.error("time %s found %i times within bounds:" % (time, len(ii)))
+                        logging.error(f"time {time} found {len(ii)} times within bounds:")
                         for it in range(len(tt1)):
-                            logging.error("[%s,%s]" % (t1[it], t2[it]))
+                            logging.error(f"[{tt1[it]},{tt2[it]}]")
                         # endfor
                         raise Exception
                     # endif
@@ -446,7 +465,7 @@
                     itime = ii[0]
                     # info ..
                     logging.info(
-                        indent + "    target time interval: [%s,%s)" % (tt1[itime], tt2[itime])
+                        f"{indent}    target time interval: [{tt1[itime]},{tt2[itime]})"
                     )

                     # storage:
@@ -462,8 +481,7 @@
                         shp = das_in[vkey].shape[1:]
                         if numpy.prod(shp) != 1:
                             logging.error(
-                                'source variable "%s" has shape %s per pixel, but currently only scalars supported ...'
-                                % (vkey, str(shp))
+                                f"source variable '{vkey}' has shape {str(shp)} per pixel, but currently only scalars supported ..."
                             )
                             raise Exception
                         # endif
@@ -483,7 +501,7 @@
                     # endif  # init output?
                     # info ..
-                    logging.info(indent + "    loop over locations ...")
+                    logging.info(f"{indent}    loop over locations ...")
                     # loop over locations:
                     for iloc in range(nloc):
                         # index:
@@ -492,7 +510,7 @@
                         lon = locations.at[indx, loc_lon]
                         lat = locations.at[indx, loc_lat]
                         # info ...
-                        logging.info(indent + "      location % (%f,%f)" % (iloc + 1, lon, lat))
+                        logging.info(f"{indent}      location {iloc+1} ({lon:7.2f},{lat:6.2f})")
                         # distance in m from pixels to location:
                         dist_m = cso_mapping.LonLatDistance(
                             lon,
@@ -505,15 +523,13 @@
                         # any?
                         if len(iipix) == 0:
                             # info ...
-                            logging.info(indent + "      no pixels within %i km" % distance_km)
+                            logging.info(f"{indent}      no pixels within {distance_km} km")
                         else:
                             # count:
                             nval = len(iipix)
                             # info ...
                             logging.info(
-                                indent
-                                + "      found %i selected pixels within %i km"
-                                % (nval, distance_km)
+                                f"{indent}      found {nval} selected pixels within {distance_km} km"
                             )
                             # add contribution:
                             outf.ds["npixel"].values[itime, iloc] += nval
@@ -534,7 +550,7 @@
                 # endfor  # input times

                 # info ...
-                logging.info(indent + "  averages ...")
+                logging.info(f"{indent}  averages ...")
                 # times/locations with pixels found:
                 kk, ii = numpy.where(outf.ds["npixel"].values > 0)
                 # any?
@@ -547,7 +563,7 @@
                 # endif  # any pixels co-located

                 # info ...
-                logging.info(indent + "  write ...")
+                logging.info(f"{indent}  write ...")
                 # write:
                 outf.Write(
                     outfile,
@@ -560,7 +576,7 @@
             else:
                 # info ...
-                logging.info(indent + "keep %s .." % outfile)
+                logging.info(f"{indent}keep {outfile} ...")
             # endif  # renew
         # endfor  # outtimes
@@ -568,9 +584,9 @@
         # endfor  # time loop

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** end")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** end")
+        logging.info(f"{indent}")
     # enddef __init__

@@ -690,9 +706,9 @@ class CSO_CoLocatePlotTimeSeries(cso_catalogue.CSO_CatalogueBase):
         cso_catalogue.CSO_CatalogueBase.__init__(self, rcfile, rcbase=rcbase, env=env)

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** Plot Co-located time series")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** Plot Co-located time series")
+        logging.info(f"{indent}")

         # time range:
         t1 = self.GetSetting("timerange.start", totype="datetime")
@@ -700,12 +716,12 @@ class CSO_CoLocatePlotTimeSeries(cso_catalogue.CSO_CatalogueBase):
         # info ...
         tfmt = "%Y-%m-%d %H:%M"
-        logging.info(indent + "timerange: [%s,%s]" % (t1.strftime(tfmt), t2.strftime(tfmt)))
+        logging.info(f"{indent}timerange: [{t1.strftime(tfmt)},{t2.strftime(tfmt)}]")

         # renew?
         renew = self.GetSetting("renew", totype="bool")
         # info ...
-        logging.info(indent + "renew output: %s" % renew)
+        logging.info(f"{indent}renew output: {renew}")

         # filter for input files:
         fpattern = self.GetSetting("input.files")
@@ -771,13 +787,13 @@
             styles[varname] = {}
             for resolution in resolutions:
                 styles[varname][resolution] = self.GetSetting(
-                    "var.%s.%s.style" % (varname, resolution), totype="dict"
+                    f"var.{varname}.{resolution}.style", totype="dict"
                 )
             # endfor
         # endfor

         # info ...
-        logging.info(indent + "resample ...")
+        logging.info(f"{indent}resample ...")
         # resample (all locations):
         #   das['N']['yr'   ] = ..
         #   das['N']['times'] = ..
@@ -832,16 +848,16 @@
         # endfor  # time resols

         # loop over locations:
-        for iloc in range(ds.dims["location"]):
+        for iloc in range(ds.sizes["location"]):
             # info ..
-            logging.info("location %i / %i .." % (iloc + 1, ds.dims["location"]))
+            logging.info(f"{indent}location {iloc+1} / {ds.sizes['location']} ...")

             # target file, replace templates:
             figfile = figfile_template.format(**loci[iloc])

             # renew?
             if (not os.path.isfile(figfile)) or renew:
                 # info ..
-                logging.info("  create %s .." % figfile)
+                logging.info(f"{indent}  create {figfile} ...")

                 # new:
                 fig = plt.figure(figsize=figsize)
@@ -896,15 +912,15 @@
             else:
                 # info ...
-                logging.info("  keep %s .." % figfile)
+                logging.info(f"{indent}  keep {figfile} ...")
             # endif
         # endfor  # locations

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** end")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** end")
+        logging.info(f"{indent}")
     # enddef __init__

diff --git a/src/cso/cso_file.py b/src/cso/cso_file.py
index 09da2bb..22a69cb 100644
--- a/src/cso/cso_file.py
+++ b/src/cso/cso_file.py
@@ -36,6 +36,9 @@
 #   - added GetValue method
 #   Removed usage of undefined filename in error messages.
 #
+# 2025-04, Arjo Segers
+# Enable zlib compression only for numerical data.
+#
 ########################################################################
 ###
@@ -606,9 +609,18 @@ class CSO_File(object):
         if complevel > 0:
             # loop over variables:
             for vname in self.ds.keys():
-                # enable zlib compression and set level:
-                self.ds[vname].encoding["zlib"] = True
-                self.ds[vname].encoding["complevel"] = complevel
+                # only numerical values:
+                if self.ds[vname].dtype in [
+                    numpy.int16,
+                    numpy.int32,
+                    numpy.int64,
+                    numpy.float32,
+                    numpy.float64,
+                ]:
+                    # enable zlib compression and set level:
+                    self.ds[vname].encoding["zlib"] = True
+                    self.ds[vname].encoding["complevel"] = complevel
+                # endif
             # endfor  # variables
         # endif  # comppress
--
GitLab


From 9ae526029718f432c14fdc6ea3901ccb5344863b Mon Sep 17 00:00:00 2001
From: Arjo Segers
Date: Thu, 1 May 2025 10:39:33 +0200
Subject: [PATCH 2/2] Improved check on arrays being numeric.

Avoid warnings from packing in case of all-nan values.
---
 src/cso/cso_file.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/cso/cso_file.py b/src/cso/cso_file.py
index 22a69cb..6cd93a7 100644
--- a/src/cso/cso_file.py
+++ b/src/cso/cso_file.py
@@ -38,6 +38,7 @@
 #
 # 2025-04, Arjo Segers
 # Enable zlib compression only for numerical data.
+# Avoid warnings from packing in case of all-nan values.
 #
 ########################################################################
@@ -188,9 +189,16 @@ def Pack_DataArray(da, dtype="i2"):
     # only floats ...
     if da.dtype in [numpy.float32, numpy.float64]:
-        # value range, ignore nan's:
-        vmin = numpy.nanmin(da.values)
-        vmax = numpy.nanmax(da.values)
+        # should have some values ..
+        if numpy.any(~numpy.isnan(da.values)):
+            # value range, ignore nan's:
+            vmin = numpy.nanmin(da.values)
+            vmax = numpy.nanmax(da.values)
+        else:
+            # dummy range:
+            vmin = 0.0
+            vmax = 0.0
+        # endif
         # target data type could be integer or float:
         if dtype.startswith("i"):
             # use absolute minimum to represent nans:
@@ -610,13 +618,8 @@ class CSO_File(object):
             # loop over variables:
             for vname in self.ds.keys():
                 # only numerical values:
-                if self.ds[vname].dtype in [
-                    numpy.int16,
-                    numpy.int32,
-                    numpy.int64,
-                    numpy.float32,
-                    numpy.float64,
-                ]:
+                dtype = self.ds[vname].dtype
+                if numpy.issubdtype(dtype, numpy.integer) or numpy.issubdtype(dtype, numpy.floating):
                     # enable zlib compression and set level:
                     self.ds[vname].encoding["zlib"] = True
                     self.ds[vname].encoding["complevel"] = complevel
--
GitLab
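
The new ``locations.skip_columns`` handling in ``cso_colocate.py`` can be illustrated outside the CSO classes; the sketch below assumes a ';'-separated locations file as documented in the class docstring, and uses plain ``pandas.read_csv`` and hard-coded setting values in place of the actual ``GetSetting``/read machinery::

    # illustration only: read a locations table and drop auxiliary columns
    # before they end up as "location_<name>" variables in the output file
    import pandas

    locations_file = "locations.csv"      # hypothetical path
    skip_columns = "flag value".split()   # value of ".locations.skip_columns"

    # read with the separator/comment conventions from the rcfile settings:
    locations = pandas.read_csv(locations_file, sep=";", comment="#")

    # drop the auxiliary columns, if any were configured:
    if len(skip_columns) > 0:
        locations = locations.drop(columns=skip_columns)
    # endif

    # the remaining columns become 1D variables with dimension (location,):
    for key in locations.keys():
        print(f"location_{key}", locations[key].values)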
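
The yearly branch added to the output intervals follows the same pattern as the daily and monthly cases; a standalone sketch with an example time range is shown below (the ``"YS"``/``"Y"`` frequency aliases are taken from the patch, recent pandas versions spell the year-end alias ``"YE"``)::

    # illustration of the yearly output intervals: each interval starts at a
    # year start and ends one second before the next year start
    import pandas

    t1 = pandas.Timestamp("2018-01-01 00:00")
    t2 = pandas.Timestamp("2020-12-31 23:59")

    outtimes1 = pandas.date_range(t1, t2, freq="YS")
    outtimes2 = pandas.date_range(t1, t2, freq="Y") + pandas.DateOffset(days=1, seconds=-1)

    for outtime1, outtime2 in zip(outtimes1, outtimes2):
        print(outtime1, "..", outtime2)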
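
The per-location pixel selection relies on ``cso_mapping.LonLatDistance``, whose implementation is not part of this patch; the sketch below stands in for it with a plain haversine distance to show how pixel indices within ``distance_km`` of a location are picked (pixel coordinates and the location are made-up examples)::

    # illustration only: select pixel indices within a given distance of a location
    import numpy

    def lonlat_distance_m(lon0, lat0, lons, lats, radius_m=6.371e6):
        """Great-circle distance in m between (lon0,lat0) and arrays (lons,lats) in degrees."""
        lam0, phi0 = numpy.radians(lon0), numpy.radians(lat0)
        lam, phi = numpy.radians(lons), numpy.radians(lats)
        a = numpy.sin((phi - phi0) / 2) ** 2 + numpy.cos(phi0) * numpy.cos(phi) * numpy.sin((lam - lam0) / 2) ** 2
        return 2 * radius_m * numpy.arcsin(numpy.sqrt(a))
    # enddef

    # example pixel centers and one target location:
    pix_lons = numpy.array([0.10, 0.25, 1.50])
    pix_lats = numpy.array([43.60, 43.70, 44.50])
    lon, lat, distance_km = 0.18, 43.62, 20

    dist_m = lonlat_distance_m(lon, lat, pix_lons, pix_lats)
    (iipix,) = numpy.where(dist_m <= distance_km * 1.0e3)
    print("selected pixels:", iipix)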
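
The compression change in ``cso_file.py`` only attaches zlib encodings to integer and floating variables, so string-like variables (for example the ``location_*`` columns) are written without compression settings; a standalone sketch with an example xarray dataset follows, where the variable names and output path are illustrative::

    # illustration: enable zlib compression for numeric variables only
    import numpy
    import xarray

    ds = xarray.Dataset(
        {
            "yr": (("time",), numpy.random.rand(24).astype("f4")),
            "location_name": (("location",), numpy.array(["Peyrusse Vieille"], dtype=object)),
        }
    )

    complevel = 4
    for vname in ds.keys():
        dtype = ds[vname].dtype
        if numpy.issubdtype(dtype, numpy.integer) or numpy.issubdtype(dtype, numpy.floating):
            ds[vname].encoding["zlib"] = True
            ds[vname].encoding["complevel"] = complevel
        # endif
    # endfor

    ds.to_netcdf("colocated.nc")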
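
The packing guard added in the second patch avoids the ``RuntimeWarning: All-NaN slice encountered`` that ``numpy.nanmin``/``numpy.nanmax`` emit on arrays without any valid value; stripped of the surrounding ``Pack_DataArray`` code it amounts to the following standalone example::

    # illustration: fall back to a dummy packing range for all-nan arrays
    import numpy

    values = numpy.full((4, 3), numpy.nan, dtype="f4")

    # should have some values ..
    if numpy.any(~numpy.isnan(values)):
        # value range, ignore nan's:
        vmin = numpy.nanmin(values)
        vmax = numpy.nanmax(values)
    else:
        # dummy range; the packed variable will only contain fill values:
        vmin = 0.0
        vmax = 0.0
    # endif

    print("packing range:", vmin, vmax)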