From 46c9e3af33dbb75b372552d77ca8750640f0dd13 Mon Sep 17 00:00:00 2001
From: Arjo Segers
Date: Wed, 30 Apr 2025 16:35:38 +0200
Subject: [PATCH 1/2] Fixes for colocated output.

---
 src/cso/cso_colocate.py | 154 ++++++++++++++++++++++------------------
 src/cso/cso_file.py     |  18 ++++-
 2 files changed, 100 insertions(+), 72 deletions(-)

diff --git a/src/cso/cso_colocate.py b/src/cso/cso_colocate.py
index b53a782..d76b687 100644
--- a/src/cso/cso_colocate.py
+++ b/src/cso/cso_colocate.py
@@ -7,6 +7,10 @@
 # 2023, Lewis Blake, Arjo Segers
 #   Formatted using "black".
 #
+# 2025-04, Arjo Segers
+# Added option to drop some columns of the location csv after reading;
+# this prevents these columns from being added to the co-location output.
+#
 ########################################################################

@@ -73,7 +77,7 @@ class CSO_CoLocate(utopya.UtopyaRc):
     surface locations. The value assigned to location and time is an average over
     all pixels within a specified distance and valid for the time.

-    Location should be specified in a csv file with columns at least longitude and latiude::
+    Locations should be specified in a csv file with at least columns that specify the longitude and latitude::

        name             ;longitude;latitude
        Peyrusse Vieille ;     0.18;    43.62
@@ -92,12 +96,19 @@ class CSO_CoLocate(utopya.UtopyaRc):
        .locations.sep        :  ;
        .locations.comment    :  #

-    Also specify the column names that used for the longitude and latitude::
+    Also specify the column names that are used for the longitude and latitude::

        ! column names:
        .locations.longitude    :  longitude
        .locations.latiutde     :  latiutde

+    The content of the csv file is included in the output file as 1D variables with dimension ``(location,)``.
+    These variables have the name of the column in the csv file, preceded by ``location_``.
+    Optionally specify that some columns should be skipped::
+
+        ! skip auxiliary columns:
+        .locations.skip_columns    :  flag value
+
     Time series are created within a time range::

        ! time range:
@@ -198,9 +209,9 @@ class CSO_CoLocate(utopya.UtopyaRc):
         utopya.UtopyaRc.__init__(self, rcfile, rcbase=rcbase, env=env)

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** Co-located CSO data")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** Co-located CSO data")
+        logging.info(f"{indent}")

         # time range:
         t1 = self.GetSetting("timerange.start", totype="datetime")
         t2 = self.GetSetting("timerange.end", totype="datetime")
         # info ...
         tfmt = "%Y-%m-%d %H:%M"
-        logging.info(indent + "timerange: [%s,%s]" % (t1.strftime(tfmt), t2.strftime(tfmt)))
+        logging.info(f"{indent}timerange: [{t1.strftime(tfmt)},{t2.strftime(tfmt)}]")

         # output time step:
         output_freq = self.GetSetting("output.freq")
         # info ...
-        logging.info(indent + "output freqency: %s" % output_freq)
+        logging.info(f"{indent}output frequency: {output_freq}")
         # output time intervals:
         if output_freq in ["D", "day", "daily"]:
             outtimes1 = pandas.date_range(t1, t2, freq="D")
@@ -221,20 +232,23 @@
         elif output_freq in ["M", "month", "monthly"]:
             outtimes1 = pandas.date_range(t1, t2, freq="MS")
             outtimes2 = outtimes1 + pandas.DateOffset(months=1, seconds=-1)
+        elif output_freq in ["Y", "year", "yearly"]:
+            outtimes1 = pandas.date_range(t1, t2, freq="YS")
+            outtimes2 = pandas.date_range(t1, t2, freq="Y") + pandas.DateOffset(days=1, seconds=-1)
         else:
-            logging.error('unsupported output time step "%s"' % step)
+            logging.error(f"unsupported output time step '{output_freq}'")
             raise Exception
         # endif

         # target file:
         outfile_template = self.GetSetting("output.file")
         # info ...
-        logging.info(indent + "target files: %s" % outfile_template)
+        logging.info(f"{indent}target files: {outfile_template}")

         # renew?
         renew = self.GetSetting("renew", totype="bool")
         # info ...
-        logging.info(indent + "renew output: %s" % renew)
+        logging.info(f"{indent}renew output: {renew}")

         # pack floats as shorts?
         packed = self.GetSetting("output.packed", "bool")
@@ -245,21 +259,21 @@
         gattrs = {}
         attrnames = self.GetSetting("output.attrs").split()
         for key in attrnames:
-            gattrs[key] = self.GetSetting("output.attr.%s" % (key))
+            gattrs[key] = self.GetSetting(f"output.attr.{key}")
         # endfor

         # station file:
         locations_file = self.GetSetting("locations.file")
         # info ...
-        logging.info(indent + "location file: %s" % locations_file)
+        logging.info(f"{indent}location file: {locations_file}")
         # check ..
         if not os.path.isfile(locations_file):
-            logging.error("file not found: %s" % locations_file)
+            logging.error(f"file not found: {locations_file}")
             raise Exception
         # endif
         # special characters:
         sep = self.GetSetting("locations.sep", default=";")
-        comment = self.GetSetting("locations.sep", comment="#")
+        comment = self.GetSetting("locations.comment", default="#")
         # column names:
         loc_lon = self.GetSetting("locations.longitude")
         loc_lat = self.GetSetting("locations.latitude")
@@ -270,6 +284,16 @@
         # count:
         nloc = len(locations)

+        # skip some columns from the file?
+        skip_columns = self.GetSetting("locations.skip_columns", default="").split()
+        # defined?
+        if len(skip_columns) > 0:
+            # info ...
+            logging.info(f"{indent}skip columns: {skip_columns}")
+            # remove columns:
+            locations = locations.drop(columns=skip_columns)
+        # endif
+
         # time resolution
         resolution = self.GetSetting("resolution")

@@ -284,7 +308,7 @@
         for stype in stypes:
             # tempate for input file:
             sources[stype] = {}
-            sources[stype]["input"] = self.GetSetting("source.%s" % stype)
+            sources[stype]["input"] = self.GetSetting(f"source.{stype}")
         # endfor

         # first source type, this should have the footprints:
@@ -296,7 +320,7 @@
         varsources = {}
         for vkey in vkeys:
             # source description:
-            varsources[vkey] = self.GetSetting("output.%s.source" % (vkey))
+            varsources[vkey] = self.GetSetting(f"output.{vkey}.source")
         # endfor

         # loop over times for output files:
@@ -309,13 +333,13 @@
             # create?
             if (not os.path.isfile(outfile)) or renew:
                 # info ...
-                logging.info(indent + "create %s .." % outfile)
+                logging.info(f"{indent}create {outfile} ..")

                 # averaging time intervals:
                 if resolution == "hourly":
                     # start and end times:
                     tt1 = pandas.date_range(outtime1, outtime2, freq="H")
-                    tt2 = t1 + pandas.Timedelta(hours=1)
+                    tt2 = tt1 + pandas.Timedelta(hours=1)
                 #
                 elif resolution == "daily":
                     # start and end times:
@@ -323,7 +347,7 @@
                     tt1 = pandas.date_range(outtime1, outtime2, freq="D")
                     tt2 = tt1 + pandas.Timedelta(days=1)
                 #
                 else:
-                    logging.error('unsupported resolution "%s"' % resolution)
+                    logging.error(f"unsupported resolution '{resolution}'")
                     raise Exception
                 # endif
@@ -332,7 +356,7 @@
                 # mid:
                 ttm = tt1 + (tt2 - tt1) / 2
                 # count:
                 nt = len(ttm)

                 # info ...
-                logging.info(indent + "init output file ...")
+                logging.info(f"{indent}init output file ...")
                 # init output file:
                 outf = cso_file.CSO_File()
                 # locations:
@@ -349,13 +373,13 @@
                 # store location info:
                 for key in locations.keys():
                     # variable name:
-                    vname = "location_%s" % key
+                    vname = f"location_{key}"
                     # add data:
                     outf.AddDataVariable(
                         vname,
                         ("location",),
                         values=locations[key],
-                        attrs={"long_name": "location %s" % key, "units": "1"},
+                        attrs={"long_name": f"location {key}", "units": "1"},
                     )
                 # endfor
                 # storage for number of selected pixels:
@@ -364,7 +388,7 @@
                     ("time", "location"),
                     values=numpy.zeros((nt, nloc), dtype="i2"),
                     attrs={
-                        "long_name": "number of pixels within %i km" % distance_km,
+                        "long_name": f"number of pixels within {distance_km} km",
                         "units": "1",
                     },
                 )

                 outf_setup = False

                 # init history:
-                line = "averages over pixels with %i km from locations specified in %s;" % (
-                    distance_km,
-                    locations_file,
-                )
+                line = f"averages over pixels within {distance_km} km from locations specified in {locations_file};"
                 line = line + "data files:"
                 for stype in stypes:
                     line = line + " " + sources[stype]["input"]
@@ -386,10 +407,8 @@
                 # assume input has hourly timestamps ..
                 times = pandas.date_range(outtime1, outtime2, freq="H")
                 # info ...
-                logging.info(
-                    indent + "search for input files in [%s,%s] .." % (outtime1, outtime2)
-                )
-                logging.info(indent + "  search times: %s, .., %s" % (times[0], times[-1]))
+                logging.info(f"{indent}search for input files in [{outtime1},{outtime2}] ..")
+                logging.info(f"{indent}  search times: {times[0]}, .., {times[-1]}")
                 # list of processed files:
                 donefiles = []
                 # loop over input times:
@@ -404,41 +423,41 @@
                         continue

                     # info ...
-                    logging.info(indent + "  read input file(s) ...")
+                    logging.info(f"{indent}  read input file(s) ...")
                     # loop:
                     for stype in stypes:
                         # input file:
                         infile = time.strftime(sources[stype]["input"])
                         # info ..
-                        logging.info(indent + "    %s" % infile)
+                        logging.info(f"{indent}    {infile}")
                         # read:
                         sources[stype]["file"] = cso_file.CSO_File(filename=infile)
                         ## update history:
-                        # history.insert( 0, 'pixel data from %s' % os.path.basename(infile) )
+                        # history.insert( 0, f"pixel data from {os.path.basename(infile)}")
                     # endfor

                     # filter:
                     selected, history = sources[stype0]["file"].SelectPixels(
-                        self.rcf, rcbase, indent=indent + "    "
+                        self.rcf, rcbase, indent=f"{indent}    "
                     )
                     # count:
                     npix = selected.size
                     nsel = selected.sum()
                     # none?
                     if nsel == 0:
-                        loggin.info(indent + "    no pixels selected ..")
+                        logging.info(f"{indent}    no pixels selected ..")
                         continue
                     # endif
                     # info ..
-                    logging.info(indent + "    selected %i / %i pixels .." % (nsel, npix))
+                    logging.info(f"{indent}    selected {nsel} / {npix} pixels ..")

                     # target record:
-                    (ii,) = numpy.where((time >= tt1) & (time < tt2))
+                    (ii,) = numpy.where((time > tt1) & (time <= tt2))
                     # check ..
                     if len(ii) != 1:
-                        logging.error("time %s found %i times within bounds:" % (time, len(ii)))
+                        logging.error(f"time {time} found {len(ii)} times within bounds:")
                         for it in range(len(tt1)):
-                            logging.error("[%s,%s]" % (t1[it], t2[it]))
+                            logging.error(f"[{tt1[it]},{tt2[it]}]")
                         # endfor
                         raise Exception
                     # endif
@@ -446,7 +465,7 @@
                     itime = ii[0]
                     # info ..
                     logging.info(
-                        indent + "    target time interval: [%s,%s)" % (tt1[itime], tt2[itime])
+                        f"{indent}    target time interval: [{tt1[itime]},{tt2[itime]})"
                     )

                     # storage:
@@ -462,8 +481,7 @@
                         shp = das_in[vkey].shape[1:]
                         if numpy.prod(shp) != 1:
                             logging.error(
-                                'source variable "%s" has shape %s per pixel, but currently only scalars supported ...'
-                                % (vkey, str(shp))
+                                f"source variable '{vkey}' has shape {str(shp)} per pixel, but currently only scalars supported ..."
                             )
                             raise Exception
                         # endif
@@ -483,7 +501,7 @@
                     # endif  # init output?
                     # info ..
-                    logging.info(indent + "    loop over locations ...")
+                    logging.info(f"{indent}    loop over locations ...")
                     # loop over locations:
                     for iloc in range(nloc):
                         # index:
@@ -492,7 +510,7 @@
                         lon = locations.at[indx, loc_lon]
                         lat = locations.at[indx, loc_lat]
                         # info ...
-                        logging.info(indent + "      location % (%f,%f)" % (iloc + 1, lon, lat))
+                        logging.info(f"{indent}      location {iloc+1} ({lon:7.2f},{lat:6.2f})")
                         # distance in m from pixels to location:
                         dist_m = cso_mapping.LonLatDistance(
                             lon,
@@ -505,15 +523,13 @@
                         # any?
                         if len(iipix) == 0:
                             # info ...
-                            logging.info(indent + "      no pixels within %i km" % distance_km)
+                            logging.info(f"{indent}      no pixels within {distance_km} km")
                         else:
                             # count:
                             nval = len(iipix)
                             # info ...
                             logging.info(
-                                indent
-                                + "      found %i selected pixels within %i km"
-                                % (nval, distance_km)
+                                f"{indent}      found {nval} selected pixels within {distance_km} km"
                             )
                             # add contribution:
                             outf.ds["npixel"].values[itime, iloc] += nval
@@ -534,7 +550,7 @@
                 # endfor  # input times

                 # info ...
-                logging.info(indent + "  averages ...")
+                logging.info(f"{indent}  averages ...")
                 # times/locations with pixels found:
                 kk, ii = numpy.where(outf.ds["npixel"].values > 0)
                 # any?
@@ -547,7 +563,7 @@
                 # endif  # any pixels co-located

                 # info ...
-                logging.info(indent + "  write ...")
+                logging.info(f"{indent}  write ...")
                 # write:
                 outf.Write(
                     outfile,
@@ -560,7 +576,7 @@
             else:
                 # info ...
-                logging.info(indent + "keep %s .." % outfile)
+                logging.info(f"{indent}keep {outfile} ...")
             # endif  # renew
         # endfor  # outtimes
@@ -568,9 +584,9 @@
         # endfor  # time loop

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** end")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** end")
+        logging.info(f"{indent}")
     # enddef __init__

@@ -690,9 +706,9 @@ class CSO_CoLocatePlotTimeSeries(cso_catalogue.CSO_CatalogueBase):
         cso_catalogue.CSO_CatalogueBase.__init__(self, rcfile, rcbase=rcbase, env=env)

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** Plot Co-located time series")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** Plot Co-located time series")
+        logging.info(f"{indent}")

         # time range:
         t1 = self.GetSetting("timerange.start", totype="datetime")
@@ -700,12 +716,12 @@ class CSO_CoLocatePlotTimeSeries(cso_catalogue.CSO_CatalogueBase):
         # info ...
         tfmt = "%Y-%m-%d %H:%M"
-        logging.info(indent + "timerange: [%s,%s]" % (t1.strftime(tfmt), t2.strftime(tfmt)))
+        logging.info(f"{indent}timerange: [{t1.strftime(tfmt)},{t2.strftime(tfmt)}]")

         # renew?
         renew = self.GetSetting("renew", totype="bool")
         # info ...
-        logging.info(indent + "renew output: %s" % renew)
+        logging.info(f"{indent}renew output: {renew}")

         # filter for input files:
         fpattern = self.GetSetting("input.files")
@@ -771,13 +787,13 @@
             styles[varname] = {}
             for resolution in resolutions:
                 styles[varname][resolution] = self.GetSetting(
-                    "var.%s.%s.style" % (varname, resolution), totype="dict"
+                    f"var.{varname}.{resolution}.style", totype="dict"
                 )
             # endfor
         # endfor

         # info ...
-        logging.info(indent + "resample ...")
+        logging.info(f"{indent}resample ...")
         # resample (all locations):
         #   das['N']['yr'   ] = ..
         #   das['N']['times'] = ..
@@ -832,16 +848,16 @@
         # endfor  # time resols

         # loop over locations:
-        for iloc in range(ds.dims["location"]):
+        for iloc in range(ds.sizes["location"]):
             # info ..
-            logging.info("location %i / %i .." % (iloc + 1, ds.dims["location"]))
+            logging.info(f"{indent}location {iloc+1} / {ds.sizes['location']} ...")

             # target file, replace templates:
             figfile = figfile_template.format(**loci[iloc])

             # renew?
             if (not os.path.isfile(figfile)) or renew:
                 # info ..
-                logging.info("  create %s .." % figfile)
+                logging.info(f"{indent}  create {figfile} ...")

                 # new:
                 fig = plt.figure(figsize=figsize)
@@ -896,15 +912,15 @@
             else:
                 # info ...
-                logging.info("  keep %s .." % figfile)
+                logging.info(f"{indent}  keep {figfile} ...")
             # endif
         # endfor  # locations

         # info ...
-        logging.info(indent + "")
-        logging.info(indent + "** end")
-        logging.info(indent + "")
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** end")
+        logging.info(f"{indent}")
     # enddef __init__

diff --git a/src/cso/cso_file.py b/src/cso/cso_file.py
index 09da2bb..22a69cb 100644
--- a/src/cso/cso_file.py
+++ b/src/cso/cso_file.py
@@ -36,6 +36,9 @@
 #   - added GetValue method
 #   Removed usage of undefined filename in error messages.
 #
+# 2025-04, Arjo Segers
+# Enable zlib compression only for numerical data.
+#
 ########################################################################
 ###
@@ -606,9 +609,18 @@ class CSO_File(object):
         if complevel > 0:
             # loop over variables:
             for vname in self.ds.keys():
-                # enable zlib compression and set level:
-                self.ds[vname].encoding["zlib"] = True
-                self.ds[vname].encoding["complevel"] = complevel
+                # only numerical values:
+                if self.ds[vname].dtype in [
+                    numpy.int16,
+                    numpy.int32,
+                    numpy.int64,
+                    numpy.float32,
+                    numpy.float64,
+                ]:
+                    # enable zlib compression and set level:
+                    self.ds[vname].encoding["zlib"] = True
+                    self.ds[vname].encoding["complevel"] = complevel
+                # endif
             # endfor  # variables
         # endif  # comppress
--
GitLab


From 9ae526029718f432c14fdc6ea3901ccb5344863b Mon Sep 17 00:00:00 2001
From: Arjo Segers
Date: Thu, 1 May 2025 10:39:33 +0200
Subject: [PATCH 2/2] Improved check on arrays being numeric.

Avoid warnings from packing in case of all-nan values.
---
 src/cso/cso_file.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/cso/cso_file.py b/src/cso/cso_file.py
index 22a69cb..6cd93a7 100644
--- a/src/cso/cso_file.py
+++ b/src/cso/cso_file.py
@@ -38,6 +38,7 @@
 #
 # 2025-04, Arjo Segers
 # Enable zlib compression only for numerical data.
+# Avoid warnings from packing in case of all-nan values.
 #
 ########################################################################
@@ -188,9 +189,16 @@ def Pack_DataArray(da, dtype="i2"):
     # only floats ...
     if da.dtype in [numpy.float32, numpy.float64]:
-        # value range, ignore nan's:
-        vmin = numpy.nanmin(da.values)
-        vmax = numpy.nanmax(da.values)
+        # should have some values ..
+        if numpy.any(~numpy.isnan(da.values)):
+            # value range, ignore nan's:
+            vmin = numpy.nanmin(da.values)
+            vmax = numpy.nanmax(da.values)
+        else:
+            # dummy range:
+            vmin = 0.0
+            vmax = 0.0
+        # endif
         # target data type could be integer or float:
         if dtype.startswith("i"):
             # use absolute minimum to represent nans:
@@ -610,13 +618,8 @@ class CSO_File(object):
             # loop over variables:
             for vname in self.ds.keys():
                 # only numerical values:
-                if self.ds[vname].dtype in [
-                    numpy.int16,
-                    numpy.int32,
-                    numpy.int64,
-                    numpy.float32,
-                    numpy.float64,
-                ]:
+                dtype = self.ds[vname].dtype
+                if numpy.issubdtype(dtype, numpy.integer) or numpy.issubdtype(dtype, numpy.floating):
                     # enable zlib compression and set level:
                     self.ds[vname].encoding["zlib"] = True
                     self.ds[vname].encoding["complevel"] = complevel
--
GitLab
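
The new ``locations.skip_columns`` handling in ``cso_colocate.py`` can be illustrated outside the CSO classes; the sketch below assumes a ';'-separated locations file as documented in the class docstring, and uses plain ``pandas.read_csv`` and hard-coded setting values in place of the actual ``GetSetting``/read machinery::

    # illustration only: read a locations table and drop auxiliary columns
    # before they end up as "location_<name>" variables in the output file
    import pandas

    locations_file = "locations.csv"      # hypothetical path
    skip_columns = "flag value".split()   # value of ".locations.skip_columns"

    # read with the separator/comment conventions from the rcfile settings:
    locations = pandas.read_csv(locations_file, sep=";", comment="#")

    # drop the auxiliary columns, if any were configured:
    if len(skip_columns) > 0:
        locations = locations.drop(columns=skip_columns)
    # endif

    # the remaining columns become 1D variables with dimension (location,):
    for key in locations.keys():
        print(f"location_{key}", locations[key].values)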
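
The yearly branch added to the output intervals follows the same pattern as the daily and monthly cases; a standalone sketch with an example time range is shown below (the ``"YS"``/``"Y"`` frequency aliases are taken from the patch, recent pandas versions spell the year-end alias ``"YE"``)::

    # illustration of the yearly output intervals: each interval starts at a
    # year start and ends one second before the next year start
    import pandas

    t1 = pandas.Timestamp("2018-01-01 00:00")
    t2 = pandas.Timestamp("2020-12-31 23:59")

    outtimes1 = pandas.date_range(t1, t2, freq="YS")
    outtimes2 = pandas.date_range(t1, t2, freq="Y") + pandas.DateOffset(days=1, seconds=-1)

    for outtime1, outtime2 in zip(outtimes1, outtimes2):
        print(outtime1, "..", outtime2)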
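
The per-location pixel selection relies on ``cso_mapping.LonLatDistance``, whose implementation is not part of this patch; the sketch below stands in for it with a plain haversine distance to show how pixel indices within ``distance_km`` of a location are picked (pixel coordinates and the location are made-up examples)::

    # illustration only: select pixel indices within a given distance of a location
    import numpy

    def lonlat_distance_m(lon0, lat0, lons, lats, radius_m=6.371e6):
        """Great-circle distance in m between (lon0,lat0) and arrays (lons,lats) in degrees."""
        lam0, phi0 = numpy.radians(lon0), numpy.radians(lat0)
        lam, phi = numpy.radians(lons), numpy.radians(lats)
        a = numpy.sin((phi - phi0) / 2) ** 2 + numpy.cos(phi0) * numpy.cos(phi) * numpy.sin((lam - lam0) / 2) ** 2
        return 2 * radius_m * numpy.arcsin(numpy.sqrt(a))
    # enddef

    # example pixel centers and one target location:
    pix_lons = numpy.array([0.10, 0.25, 1.50])
    pix_lats = numpy.array([43.60, 43.70, 44.50])
    lon, lat, distance_km = 0.18, 43.62, 20

    dist_m = lonlat_distance_m(lon, lat, pix_lons, pix_lats)
    (iipix,) = numpy.where(dist_m <= distance_km * 1.0e3)
    print("selected pixels:", iipix)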
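
The compression change in ``cso_file.py`` only attaches zlib encodings to integer and floating variables, so string-like variables (for example the ``location_*`` columns) are written without compression settings; a standalone sketch with an example xarray dataset follows, where the variable names and output path are illustrative::

    # illustration: enable zlib compression for numeric variables only
    import numpy
    import xarray

    ds = xarray.Dataset(
        {
            "yr": (("time",), numpy.random.rand(24).astype("f4")),
            "location_name": (("location",), numpy.array(["Peyrusse Vieille"], dtype=object)),
        }
    )

    complevel = 4
    for vname in ds.keys():
        dtype = ds[vname].dtype
        if numpy.issubdtype(dtype, numpy.integer) or numpy.issubdtype(dtype, numpy.floating):
            ds[vname].encoding["zlib"] = True
            ds[vname].encoding["complevel"] = complevel
        # endif
    # endfor

    ds.to_netcdf("colocated.nc")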
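
The packing guard added in the second patch avoids the ``RuntimeWarning: All-NaN slice encountered`` that ``numpy.nanmin``/``numpy.nanmax`` emit on arrays without any valid value; stripped of the surrounding ``Pack_DataArray`` code it amounts to the following standalone example::

    # illustration: fall back to a dummy packing range for all-nan arrays
    import numpy

    values = numpy.full((4, 3), numpy.nan, dtype="f4")

    # should have some values ..
    if numpy.any(~numpy.isnan(values)):
        # value range, ignore nan's:
        vmin = numpy.nanmin(values)
        vmax = numpy.nanmax(values)
    else:
        # dummy range; the packed variable will only contain fill values:
        vmin = 0.0
        vmax = 0.0
    # endif

    print("packing range:", vmin, vmax)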