diff --git a/CHANGELOG b/CHANGELOG
index 35db4a1614f26eaa912a88f872c56ee9e743bbc5..00aa344bb87df45991d4f01673d799477cf0642d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -466,3 +466,11 @@ Increased maximum number of records and introduced sort order to avoid that
search request return a different result when repeated.
py/cso_dataspace.py
+colhub branch
+~~~~~~~~~~~~~
+
+Support inquiry and use of ColHub mirror archive.
+ py/cso_colhub.py
+ py/cso_file.py
+ py/cso_s5p.py
+ py/cso.py
diff --git a/config/Copernicus/cso-s5p-no2.rc b/config/Copernicus/cso-s5p-no2.rc
index d9c3cbed8ceb7a54ec6b76858ec82fb6b7167735..5ae5685046af3119a3ba8774578776f0ad58494f 100644
--- a/config/Copernicus/cso-s5p-no2.rc
+++ b/config/Copernicus/cso-s5p-no2.rc
@@ -58,6 +58,47 @@ cso.s5p.no2.inquire-table-dataspace.download_url : https://zipper.dat
cso.s5p.no2.inquire-table-dataspace.output.file : ${my.work}/Copernicus-inquire/Copernicus_S5p_NO2_dataspace__%Y-%m-%d.csv
+!-----------------------------------------------------------------------
+! inquire colhub mirror archive
+!-----------------------------------------------------------------------
+
+! csv file that will hold records per file with:
+! - timerange of pixels in file
+! - orbit number
+cso.s5p.no2.inquire-table-colhub-mirror.output.file : ${my.work}/Copernicus-inquire/Copernicus_S5p_NO2_colhub-listing__%Y-%m-%d.csv
+
+! renew table if file already exists?
+cso.s5p.no2.inquire-table-colhub-mirror.renew : True
+
+! base path; example file:
+!
+! /x0/x0/x1/S5P_OFFL_L2__NO2____20230701T002408_20230701T020537_29604_03_020500_20230702T161349.nc
+cso.s5p.no2.inquire-table-colhub-mirror.dir : /lustre/storeB/project/ESAcolhub/production-backend-AOI/S5p/all
+! filename filter:
+cso.s5p.no2.inquire-table-colhub-mirror.pattern : S5P_*_L2__NO2_*.nc
+
+
+! ** create list of missing files
+
+! renew table if file already exists?
+cso.s5p.no2.inquire-table-colhub-missing.renew : True
+
+! csv file that will hold records per file with:
+! - timerange of pixels in file
+! - orbit number
+cso.s5p.no2.inquire-table-colhub-missing.output.file : ${my.work}/Copernicus-inquire/Copernicus_S5p_NO2_colhub-missing__%Y-%m-%d.csv
+
+! listing from DataSpace with all potentially available files:
+cso.s5p.no2.inquire-table-colhub-missing.all.file : ${cso.s5p.no2.inquire-table-dataspace.output.file}
+!cso.s5p.no2.inquire-table-colhub-missing.all.filedate : 2025-01-23
+
+! listing from current archive:
+cso.s5p.no2.inquire-table-colhub-missing.curr.file : ${cso.s5p.no2.inquire-table-colhub-mirror.output.file}
+!cso.s5p.no2.inquire-table-colhub-missing.curr.filedate : 2025-01-24
+
+! selection of orbits, see "convert" below:
+cso.s5p.no2.inquire-table-colhub-missing.selection : ${cso.s5p.no2.convert.selection}
+
+
!!-----------------------------------------------------------------------
!! inquire PAL portal
!!-----------------------------------------------------------------------
diff --git a/config/Copernicus/cso-user-settings.rc b/config/Copernicus/cso-user-settings.rc
index b07f313fa919320b5b8474c1c36481d3edc5fb1d..b7252fe00ab5fd031c968d0c908f42245140d24e 100644
--- a/config/Copernicus/cso-user-settings.rc
+++ b/config/Copernicus/cso-user-settings.rc
@@ -34,6 +34,7 @@ my.cso.convention : CF-1.7
! region name:
!my.region : globe
my.region : CAMS
+!my.region : xEMEP
! switch:
!....................................
@@ -62,6 +63,19 @@ my.region.north : 76.0
! size of map figures for this region, default size is (8,6)
my.region.figsize : (8,6)
+!....................................
+#elif "${my.region}" == "xEMEP"
+!....................................
+
+! outer bound of all known domains:
+my.region.west : -50.0
+my.region.east : 90.0
+my.region.south : 25.0
+my.region.north : 85.0
+
+! size of map figures for this region, default size is (8,6)
+my.region.figsize : (8,5)
+
!....................................
#else
#error unsupported my.region "${my.region}"
@@ -77,7 +91,7 @@ my.region.figsize : (8,6)
! full range, used for inquiry jobs:
my.full-timerange.start : 2018-01-01 00:00
-my.full-timerange.end : 2023-12-31 23:59
+my.full-timerange.end : 2024-12-31 23:59
! processing for selected period:
my.timerange.start : 2018-06-01 00:00
diff --git a/config/Copernicus/cso.rc b/config/Copernicus/cso.rc
index b773216ff69c63df3c91314158c2c8b688ce97a3..bc97fd187f8caabd0f70fdfca16710728c1d3c5e 100644
--- a/config/Copernicus/cso.rc
+++ b/config/Copernicus/cso.rc
@@ -82,6 +82,7 @@ cso.s5p.TRACER.inquire.class : utopya.UtopyaJobStep
! two or more tasks:
#if "TRACER" in ["no2","so2","hcho","co","o3-pr","o3-col"]
cso.s5p.TRACER.inquire.tasks : table-dataspace plot
+!cso.s5p.TRACER.inquire.tasks : table-colhub-mirror table-colhub-missing
#elif "TRACER" in ["so2-cobra"]
cso.s5p.TRACER.inquire.tasks : table-pal plot
#elif "TRACER" in ["chocho"]
@@ -93,6 +94,14 @@ cso.s5p.TRACER.inquire.tasks : table-glyretro table-pal pl
cso.s5p.TRACER.inquire.table-dataspace.class : cso.CSO_DataSpace_Inquire
cso.s5p.TRACER.inquire.table-dataspace.args : '${my.work}/rc/cso-s5p-TRACER.rc', \
rcbase='cso.s5p.TRACER.inquire-table-dataspace'
+!~ inquire file archive:
+cso.s5p.TRACER.inquire.table-colhub-mirror.class : cso.CSO_ColHubMirror_Inquire
+cso.s5p.TRACER.inquire.table-colhub-mirror.args : '${my.work}/rc/cso-s5p-TRACER.rc', \
+ rcbase='cso.s5p.TRACER.inquire-table-colhub-mirror'
+!~ create table with files that are missing:
+cso.s5p.TRACER.inquire.table-colhub-missing.class : cso.CSO_ColHubMirror_Missing
+cso.s5p.TRACER.inquire.table-colhub-missing.args : '${my.work}/rc/cso-s5p-TRACER.rc', \
+ rcbase='cso.s5p.TRACER.inquire-table-colhub-missing'
!~ inquire files available on PAL:
cso.s5p.TRACER.inquire.table-pal.class : cso.CSO_PAL_Inquire
cso.s5p.TRACER.inquire.table-pal.args : '${my.work}/rc/cso-s5p-TRACER.rc', \
diff --git a/doc/source/history.rst b/doc/source/history.rst
index 5b2a9e2d06f79231bc5cdabbf9686e270121c6be..69cc8c6b59be911da82852f08894d3a3a269b6cf 100644
--- a/doc/source/history.rst
+++ b/doc/source/history.rst
@@ -109,6 +109,9 @@ A summary of the versions and changes.
* | *v2.9*
| Initial support for S5P O3-profile product.
+
+* | *colhub branch*
+ | Support inquiry of ColHub mirror archive.
To be included
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 579b0a4b5cd3f96990c5f1e6e99c3b7224d83f48..16d2c87e52a7a35a04b8d4b977bc83d19dc7be3e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -42,6 +42,7 @@ Contents
gridding
superobs
colocate
+ portals
pymods
history
documentation
diff --git a/doc/source/portals.rst b/doc/source/portals.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e72cbf166fb7f1b24ac50b475e969f2eebd909a
--- /dev/null
+++ b/doc/source/portals.rst
@@ -0,0 +1,20 @@
+
+.. Label between '.. _' and ':' ; use :ref:`text ` for reference
+.. _modules-and-classes:
+
+************
+Data portals
+************
+
+Overview of data portals from where satellite observations are downloaded.
+
+* The main portal for Sentinel data is the `Copernicus DataSpace `_.
+ See the :ref:`cso-dataspace` module for a detailed description.
+
+* The `Product Algorithm Laboratory `_, or more specifically, the
+ `S5P-PAL Data Portal `_ provides pre-operational Sentinel-5P data.
+ See the :ref:`cso-pal` module for a detailed description.
+
+* The Norwegian `ColHub `_ archive is a (partial) mirror of the
+ `Copernicus DataSpace `_, with all orbits covering Norway and surrounding areas.
+ See the :ref:`cso-colhub` module for a detailed description.
diff --git a/doc/source/pymod-cso_colhub.rst b/doc/source/pymod-cso_colhub.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ab95fb93f08e1a9a6efd8590d393b52adb979171
--- /dev/null
+++ b/doc/source/pymod-cso_colhub.rst
@@ -0,0 +1,5 @@
+.. Documentation for module.
+
+.. Import documentation from ".py" file:
+.. automodule:: cso_colhub
+
diff --git a/doc/source/pymods.rst b/doc/source/pymods.rst
index daeb06504d66ae35eec8acae999c720fcfa7e26e..65b9010a6059b333ace6884bf370cfd7f2a6d588 100644
--- a/doc/source/pymods.rst
+++ b/doc/source/pymods.rst
@@ -11,7 +11,7 @@ Overview of the Python module(s) and classes.
CSO modules
===========
-The ``cso`` module provides the classes that do the work in the CSO pre-processor.
+The ``cso`` module provides the classes that do the work in the CSO pre- and post-processing.
.. The following are names of '.rst' files in the 'doc/sources' directory ;
this ensures that each sub-module is given a seperate page in the documentation.
@@ -29,6 +29,7 @@ Classes used for specific tasks are implemented in the ``cso_*`` modules.
pymod-cso_inquire
pymod-cso_dataspace
pymod-cso_pal
+ pymod-cso_colhub
pymod-cso_s5p
pymod-cso_file
pymod-cso_gridded
diff --git a/doc/source/s5p-no2.rst b/doc/source/s5p-no2.rst
index 6a04512643890467c58e0941b9e32e7bc802623c..e8da3dcd11219579a07ebbb1da7b728736addbaf 100644
--- a/doc/source/s5p-no2.rst
+++ b/doc/source/s5p-no2.rst
@@ -218,7 +218,7 @@ S5p/NO2 observations from KNMI have been available from at least these sources:
*This is the operational version.*
-* `Product Algorithm Laboratory `_, or more specif, the
+* `Product Algorithm Laboratory `_, or more specifically, the
`S5P-PAL Data Portal `_;
see the :ref:`cso-pal` module for a detailed description.
diff --git a/py/cso.py b/py/cso.py
index a6350ea592aec5127c316d2f4217fbb3200f02da..b7f572b3f1a9a3f03c1ad3dc0d4b0656053d9cd3 100644
--- a/py/cso.py
+++ b/py/cso.py
@@ -109,6 +109,7 @@ and are defined according to the following hierchy:
from cso_file import *
from cso_inquire import *
from cso_dataspace import *
+from cso_colhub import *
from cso_pal import *
from cso_s5p import *
from cso_gridded import *
diff --git a/py/cso_colhub.py b/py/cso_colhub.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd1fd15e2e4499f7cd9763bdcfe9c50d72be4df
--- /dev/null
+++ b/py/cso_colhub.py
@@ -0,0 +1,471 @@
+#
+# Changes
+#
+# 2022-09, Arjo Segers
+# Updated documentation.
+#
+# 2023-06, Arjo Segers
+# Use "pandas.concat()" instead of "df.append()" to avoid warnings.
+#
+# 2023-08, Arjo Segers
+# Updated logging messages.
+#
+# 2023-08, Arjo Segers
+# Reformatted using 'black'.
+#
+# 2025-01, Arjo Segers
+# Extracted parts from former "cso_scihub.py" module to support "colhub",
+# the Norwegian mirror site.
+#
+
+########################################################################
+###
+### help
+###
+########################################################################
+
+"""
+.. _cso-colhub:
+
+*********************
+``cso_colhub`` module
+*********************
+
+The :py:mod:`cso_colhub` module provides classes for accessing data from the
+Norwegian `ColHub `_ archive.
+This is a (partial) mirror of the `Copernicus DataSpace `_
+with all orbits covering Norway and surrounding areas.
+
+The data can be accessed using a web-interface,
+but for Met Norway users also directly from the storage.
+This module provides the :py:class:`.CSO_ColHubMirror_Inquire` to inquire the storage
+to see what is already available.
+In addition, the :py:class:`.CSO_ColHubMirror_Missing` is available to create a table
+that lists files that are in the DataSpace but not yet in the mirror;
+these could be used to make additional downloads.
+
+
+Class hierarchy
+===============
+
+The classes are defined according to the following hierarchy:
+
+* :py:class:`.UtopyaRc`
+
+ * :py:class:`.CSO_ColHubMirror_Inquire`
+ * :py:class:`.CSO_ColHubMirror_Missing`
+
+
+
+Classes
+=======
+
+
+"""
+
+
+########################################################################
+###
+### modules
+###
+########################################################################
+
+# modules:
+import logging
+
+# tools:
+import utopya
+
+
+########################################################################
+###
+### create listing file for file archive
+###
+########################################################################
+
+
+class CSO_ColHubMirror_Inquire(utopya.UtopyaRc):
+
+ """
+ Create *listing* file for files available in file archive.
+
+ The format is similar to the output of *inquiry* classes,
+ with per line a filename, the time range of pixels in the file,
+ and other information extracted from the filenames::
+
+ filename ;processing;start_time ;end_time ;orbit;collection;processor_version;href
+ S5P_RPRO_L2__CH4____20180430T001851_20180430T020219_02818_01_010301_20190513T141133.nc;RPRO ;2018-04-30T00:18:51;2018-04-30T02:02:19;02818;01 ;010301 ;/archive/mirror/S5P_RPRO_L2__CH4____20180430T001851_20180430T020219_02818_01_010301_20190513T141133.nc
+ S5P_RPRO_L2__CH4____20180430T020021_20180430T034349_02819_01_010301_20190513T135953.nc;RPRO ;2018-04-30T02:00:21;2018-04-30T03:43:49;02819;01 ;010301 ;/archive/mirror/S5P_RPRO_L2__CH4____20180430T020021_20180430T034349_02819_01_010301_20190513T135953.nc
+ :
+
+ This file could be used to scan for available versions and how they were produced.
+
+ In the settings, define the base directory of the archive::
+
+ .dir : /archive/mirror
+
+ This directory is recursively scanned using the :py:class:`os.walk` class on files with a specified form::
+
+ ! search S5P CH4 files:
+ .pattern : S5P_*_L2_CH4___*.nc
+
+    Specify the output file::
+
+ ! csv file that will hold records per file with:
+ ! - timerange of pixels in file
+ ! - orbit number
+ ! time templates are replaced with todays date
+ .file : /Scratch/Copernicus/S5p/listing-CH4__%Y-%m-%d.csv
+
+ An existing listing file is not replaced,
+ unless the following flag is set::
+
+ ! renew table?
+ .renew : True
+
+ """
+
+ def __init__(self, rcfile, rcbase="", env={}, indent=""):
+
+ """
+ Scan file archive.
+ """
+
+ # modules:
+ import os
+ import datetime
+ #import glob
+ import collections
+ import fnmatch
+
+ # tools:
+ import cso_file
+
+ # info ...
+ logging.info( f"{indent}")
+ logging.info( f"{indent}** create listing file")
+ logging.info( f"{indent}")
+
+ # init base object:
+ utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)
+
+ # renew output?
+ renew = self.GetSetting("renew", totype="bool")
+
+ # table file to be written:
+ lst_file = self.GetSetting("output.file")
+ # evaluate current time:
+ lst_file = datetime.datetime.now().strftime(lst_file)
+
+ # create?
+ if (not os.path.isfile(lst_file)) or renew:
+ # info ..
+ logging.info( f"{indent}create %s ..." % lst_file)
+
+            # initialize for (re)creation:
+ listing = cso_file.CSO_Listing()
+
+ # archive directory:
+ archive_dir = self.GetSetting("dir")
+ # check ..
+ if not os.path.isdir(archive_dir):
+ logging.error(f"archive directory not found: {archive_dir}")
+ raise Exception
+ #endif
+ # pattern for data files:
+ fpattern = self.GetSetting("pattern")
+
+ # recursively search for files:
+ for root, dirs, files in os.walk(archive_dir):
+
+ # loop over files:
+ for fname in files :
+ # data file?
+ if fnmatch.fnmatch( fname, fpattern ) :
+
+ # already in table?
+ if fname in listing:
+ # info ...
+ logging.info( f"{indent} keep entry %s ..." % fname)
+ else:
+ # info ...
+ logging.info( f"{indent} add entry %s ..." % fname)
+ # Example filename:
+ # S5P_RPRO_L2__CH4____20180430T001851_20180430T020219_02818_01_010301_20190513T141133.nc
+ #
+ # Some products have incorrect product id (should be 10 characters):
+ # S5P_OFFL_L2__CHOCHO___20200101T005246_20200101T023416_11487_01_010000_20210128.nc
+ # The extracted product id is then truncated to 10 characters.
+ #
+ # basename:
+ bname, ext = os.path.splitext(fname)
+ # extract:
+ try:
+ # split in 3 parts:
+ mission, processing, rest = bname.split("_", 2)
+ # extract product id, should be 10 chars:
+ product_id = rest[0:10]
+ # adhoc fix: some files have too long product id:
+ if rest.startswith("L2__CHOCHO__"):
+                                    rest = rest[13:]
+ else :
+ rest = rest[11:]
+ #endif
+ # remaining parts:
+ (
+ start_time,
+ end_time,
+ orbit,
+ collection,
+ processor_version,
+ prod_time,
+ ) = rest.split("_")
+ except:
+ logging.error(f"could not extract filename parts; expected format:")
+ logging.error(f" S5P_RPRO_L2__CH4____20180430T001851_"
+ +"20180430T020219_02818_01_010301_20190513T141133" )
+ logging.error(f"found:")
+ logging.error(f" {bname}")
+ raise
+ # endif
+ # headers in "inquire" tables:
+ # orbit;start_time;end_time;processing;collection;processor_version;filename;href
+ # fill data record:
+ data = collections.OrderedDict()
+ tfmt = "%Y%m%dT%H%M%S"
+ data["orbit"] = orbit
+ data["start_time"] = datetime.datetime.strptime(start_time, tfmt)
+ data["end_time"] = datetime.datetime.strptime(end_time, tfmt)
+ data["processing"] = processing
+ data["collection"] = collection
+ data["processor_version"] = processor_version
+ data["href"] = os.path.join(root,fname)
+ # update record:
+ listing.UpdateRecord( fname, data, indent= f"{indent} ")
+ # endif # new record?
+
+ #endif # name of data file
+
+ #endfor # files
+
+ ## testing ...
+ #if len(listing) >= 100 :
+ # logging.warning( f"BREAK after {len(listing)} files ..." )
+ # break
+ # endif
+
+ #endfor # walk
+
+ # sort:
+ listing.Sort(by="orbit")
+ # save:
+ listing.Save(lst_file,indent= f"{indent} ")
+
+ else:
+ # info ..
+ logging.info( f"{indent}keep %s ..." % lst_file)
+ # endif
+
+ # info ...
+ logging.info( f"{indent}")
+ logging.info( f"{indent}** end archive listing")
+ logging.info( f"{indent}")
+
+ # enddef __init__
+
+
+# endclass CSO_ColHubMirror_Inquire
+
+
+
+
+########################################################################
+###
+### create listing files missing in archive
+###
+########################################################################
+
+
+class CSO_ColHubMirror_Missing(utopya.UtopyaRc):
+
+ """
+ Create *listing* file for files that are in one inquiry table but not in another one.
+ This could be used to complete a mirror archive.
+
+ The format is similar to the output of *inquiry* classes,
+ with per line a filename, the time range of pixels in the file, and other information extracted from the filenames::
+
+ filename ;start_time ;end_time ;orbit;processing;collection;processor_version;href
+ S5P_RPRO_L2__NO2____20180504T073130_20180504T091300_02879_03_020400_20221208T160012.nc;2018-05-04 07:31:30;2018-05-04 09:13:00;02879 ;RPRO ;03 ;020400 ;https://zipper.dataspace.copernicus.eu/odata/v1/Products(ae43b35c-0569-4e1f-b8cb-0afc49790716)/$value
+ :
+
+ In the settings, define the listing file with all available data, for example the result of an inquiry step;
+    optionally add a timestamp to replace templates in the filename::
+
+ .all.file : /work/inquire/Copernicus_S5p_NO2_dataspace__%Y-%m-%d.csv
+ !.all.filedate : 2025-01-24
+
+ Similar specify the name of the file that is listing the current mirror,
+ probably the output of the :py:class:`CSO_ColHubMirror_Inquire` class::
+
+ .curr.file : /work/inquire/Copernicus_S5p_NO2_colhub-mirror__%Y-%m-%d.csv
+ !.curr.filedate : 2025-01-24
+
+ Specify a selection filter; this defines which of the orbit files are actually needed::
+
+        ! Provide ';' separated list of expressions to decide if a particular orbit file should be processed.
+ ! If more than one file is available for a particular orbit (from "OFFL" and "RPRO" processing),
+ ! the file with the first match will be used.
+ ! The expressions should include templates '%{header}' for the column values.
+ !
+ .selection : (%{collection} == '03') and (%{processing} == 'RPRO') ; \\
+ (%{collection} == '03') and (%{processing} == 'OFFL')
+
+    Specify the output file::
+
+ ! csv file that will hold records per file with:
+ ! - timerange of pixels in file
+ ! - orbit number
+ ! time templates are replaced with todays date
+ .file : /work/inquire/Copernicus_S5p_NO2_colhub-mirror-missing__%Y-%m-%d.csv
+
+ An existing listing file is not replaced,
+ unless the following flag is set::
+
+ ! renew table?
+ .renew : True
+
+ """
+
+ def __init__(self, rcfile, rcbase="", env={}, indent=""):
+
+ """
+ Convert data.
+ """
+
+ # modules:
+ import os
+ import datetime
+ #import glob
+ import collections
+ import fnmatch
+
+ # tools:
+ import cso_file
+
+ # info ...
+ logging.info( f"{indent}")
+ logging.info( f"{indent}** create list of missing files")
+ logging.info( f"{indent}")
+
+ # init base object:
+ utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)
+
+ # renew output?
+ renew = self.GetSetting("renew", totype="bool")
+
+ # table file to be written:
+ lst_file = self.GetSetting("output.file")
+ # evaluate current time:
+ lst_file = datetime.datetime.now().strftime(lst_file)
+
+ # create?
+ if (not os.path.isfile(lst_file)) or renew:
+ # info ..
+ logging.info( f"{indent}create %s ..." % lst_file)
+
+            # initialize for (re)creation:
+ listing = cso_file.CSO_Listing()
+
+ # table with all available files:
+ listfile_all = self.GetSetting("all.file")
+ # evaluate time?
+ filedate = self.GetSetting( "all.filedate", totype="datetime", default=datetime.datetime.now() )
+ listfile_all = filedate.strftime(listfile_all)
+ # read:
+ listing_all = cso_file.CSO_Listing(listfile_all)
+
+ # table with currently already available files:
+ listfile_curr = self.GetSetting("curr.file")
+ # evaluate time?
+ filedate = self.GetSetting( "curr.filedate", totype="datetime", default=datetime.datetime.now() )
+ listfile_curr = filedate.strftime(listfile_curr)
+ # read:
+ listing_curr = cso_file.CSO_Listing(listfile_curr)
+
+ # extract orbits:
+ orbits = listing_all.GetValues( "orbit" ).unique()
+ # info ..
+ logging.info( f"{indent}found {len(orbits)} unique orbit numbers ..." )
+
+ # list of ';' seperated selection expression:
+ # (%{processor_version} == '020400') & (%{processing} == 'RPRO') ; ...
+ selection_expr = self.GetSetting("selection")
+ # info ..
+ logging.info(f"{indent}selection criteria (first with matching orbit is used):")
+ for expr in selection_expr.split(";") :
+ logging.info(f"{indent} {expr.strip()}")
+ # endif
+
+ # info ..
+ logging.info( f"{indent}loop over available orbits ..." )
+ # loop:
+ for orbit in orbits :
+ # info ..
+ logging.info( f"{indent} orbit '{orbit}' ..." )
+
+ # selection based on orbit and filter expression;
+            # returns empty listing if no records are found:
+ xlst = listing_all.Select( orbit=orbit, expr=selection_expr, indent=" " )
+ # nothing selected?
+ if len(xlst) == 0 : continue
+
+ # loop over selected records:
+ for irec in range(len(xlst)):
+ # current:
+ rec = xlst.GetRecord(irec)
+ ## info ..
+ #logging.info( f"{indent} {fname}" )
+ # check if already available:
+ if rec["filename"] in listing_curr :
+ # info ...
+ logging.info( f"{indent} file already present ..." )
+ else :
+ # info ...
+ logging.info( f"{indent} file not present yet, add to list ..." )
+ # add record to list:
+ listing.UpdateRecord( rec["filename"], rec )
+ #endif
+ #endfor
+
+ # testing ...
+ #break
+ #if len(listing) > 0 :
+ # logging.warning(f"BREAK!")
+ # break
+ ##endif
+ #endfor
+
+ # save:
+ listing.Save(lst_file, indent= f"{indent} ")
+
+ else:
+ # info ..
+ logging.info( f"{indent}keep %s ..." % lst_file)
+ # endif # renew
+
+ # info ...
+ logging.info( f"{indent}")
+ logging.info( f"{indent}** end missing")
+ logging.info( f"{indent}")
+
+ # enddef __init__
+
+
+# endclass CSO_ColHubMirror_Missing
+
+########################################################################
+###
+### end
+###
+########################################################################
diff --git a/py/cso_file.py b/py/cso_file.py
index e926dbdd41bb094328836079742fde57ae3ee758..ca376d808221226bd38adfd2cc44d4e82812645e 100644
--- a/py/cso_file.py
+++ b/py/cso_file.py
@@ -21,7 +21,10 @@
# Do not pack coordinates. Updated comment.
#
# 2024-08, Arjo Segers
-# Updated formatting.
+# Updated formatting.
+#
+# 2025-01, Arjo Segers
+# Extended CSO_Listing class with selection and other methods.
#
########################################################################
@@ -1209,15 +1212,13 @@ class CSO_Listing(object):
2007/06/RAL-IASI-CH4_20070601T022359.nc;2007-06-01 02:23:59.512000000;2007-06-01 04:05:57.328000000
...
- Arguments:
+ Optional arguments:
- * ``filename`` : listing file, content is read if this file is already present unless ``renew==True`` ;
- if ``exist==True`` the file should exist
- * ``renew`` : (optinal) if ``True``, an existing file is ignored
+ * ``filename`` : listing file read into table
"""
- def __init__(self, filename, exist=False, renew=False, indent=""):
+ def __init__(self, filename=None, indent=""):
"""
Initialize empty table or read existing.
"""
@@ -1226,31 +1227,25 @@ class CSO_Listing(object):
import os
import pandas
- # store:
- self.filename = filename
- # base path:
- self.dirname = os.path.dirname(self.filename)
-
# csv style:
self.sep = ";"
# head for index column:
self.index_label = "filename"
+
+ # read?
+ if filename is not None :
- # check ..
- if exist:
+ # check ..
if not os.path.isfile(filename):
logging.error("listing file not found: %s" % filename)
raise Exception
# endif
- # endif
-
- # already present?
- if os.path.isfile(filename) and (not renew):
+
# info ...
- logging.info(indent + "read listing %s ..." % self.filename)
+            logging.info(f"{indent}read listing {filename} ...")
# read:
self.df = pandas.read_csv(
- self.filename,
+ filename,
sep=self.sep,
index_col=self.index_label,
parse_dates=["start_time", "end_time"],
@@ -1267,15 +1262,25 @@ class CSO_Listing(object):
# *
- def Close(self, indent=""):
+ def Save(self, filename, indent=""):
"""
Write table to file.
"""
-
+
# info ...
- logging.info(indent + "save listing %s ..." % self.filename)
- # save:
- self.df.to_csv(self.filename, sep=self.sep, index_label=self.index_label)
+        logging.info(f"{indent}save listing {filename} ...")
+
+ # create directory if needed:
+ CheckDir( filename )
+
+ # write all columns:
+ columns = list(self.df.keys())
+ # index only once ..
+ if self.index_label in columns :
+ columns.remove(self.index_label)
+
+ # save, also write the index column:
+ self.df.to_csv(filename, sep=self.sep, columns=columns, index_label=self.index_label)
# enddef Close
@@ -1309,6 +1314,23 @@ class CSO_Listing(object):
# *
+ def GetValues(self, name):
+ """
+ Return :py:class:`pandas.Series` object with all values for provided column ``name``.
+ """
+
+ # check ..
+ if name not in self.df.keys() :
+            logging.error( f"column '{name}' not found in listing" )
+ raise Exception
+ #endif
+ # extract:
+ return self.df[name]
+
+    # enddef GetValues
+
+ # *
+
def __contains__(self, key):
"""
Returns True if key is in the table index.
@@ -1320,6 +1342,17 @@ class CSO_Listing(object):
# *
+ def keys(self):
+ """
+ Returns list of column names.
+ """
+
+ return self.df.keys()
+
+ # enddef keys
+
+ # *
+
def Cleanup(self, indent=""):
"""
Remove records from table if filename is not present anymore.
@@ -1423,9 +1456,9 @@ class CSO_Listing(object):
# *
- def Select(self, tr=None, method="overlap"):
+ def Select(self, tr=None, method="overlap", expr=None, blacklist=[], indent="", **kwargs ):
"""
- Return dataframe with selected records.
+ Return :py:class:`CSO_Listing` objects with selection of records.
Optional arguments:
@@ -1435,13 +1468,28 @@ class CSO_Listing(object):
* ``method='overlap'`` (default) : select records that have pixels overlapping with interval
* ``method='middle'`` : select pixels with the middle of ``start_time`` and ``end_time``
within the inerval ``(t1,t2]``
+
+ * ``name=value`` arguments select records based on column values
+
+        * The expression ``expr`` provides a list of ';'-separated selection expressions
+ with templates ``%{..}`` for the column names::
+
+ (%{processor_version} == '020400') & (%{processing} == 'RPRO') ; ...
+
+          This is evaluated after the previously described selections.
+          The result should be None or exactly one record.
+          Optionally skip files listed in ``blacklist``.
+
"""
# modules:
+ import os
import pandas
# init result with entire dataset:
df = self.df
+
+ # *
# time range specified?
if tr is not None:
@@ -1463,9 +1511,103 @@ class CSO_Listing(object):
raise Exception
# endif
# endif
+
+ # *
+
+ # other selections:
+ for key,value in kwargs.items():
+ # check ..
+ if key not in df.keys():
+            logging.error( f"key '{key}' not defined in listing" )
+ raise Exception
+ #endif
+ # select:
+ df = df[ df[key] == value ]
+ #endfor
+
+ # *
+
+ # evaluate selection expression?
+ if expr is not None :
+ # replace templates:
+ for key in self.df.keys():
+ expr = expr.replace("%{" + key + "}", "xrec['" + key + "']")
+ # endfor
+ # split:
+ selections = expr.split(";")
+
+ # storage for status label: "selected", "blacklisted", ...
+ filestatus = {}
+ # no match yet ..
+            selected = []
+ # loop over selection criteria,
+ # this should give either none or a single file:
+ for selection in selections:
+ # make empty again:
+ selected = []
+ # loop over records:
+ for indx, xrec in df.iterrows():
+ # skip?
+ if os.path.basename(indx) in blacklist:
+ filestatus[indx] = "blacklisted"
+ continue
+ # endif
+ # evaluate expression including 'xrec[key]' values:
+ if eval(selection):
+ selected.append(indx)
+ filestatus[indx] = "selected"
+ rec = xrec
+ # endif
+ # endfor # records
+ # exactly one? then leave:
+ if len(selected) == 1:
+ break
+ elif len(selected) > 1:
+ logging.error( f"found more than one record matching selection: {selection}" )
+ for fname in selected:
+ logging.error(f" {fname}")
+ # endfor
+ raise Exception
+ # endif # number found
+ # endfor # selection criteria
+
+ # info ...
+ logging.info(f"{indent}available records(s):")
+ # loop:
+ for fname, row in df.iterrows():
+ line = fname
+ if fname in filestatus.keys():
+ line = line + " [" + filestatus[fname] + "]"
+ logging.info(f"{indent} {line}")
+ # endfor
+ # no match?
+ if len(selected) == 0:
+ # info ...
+                logging.warning(" no match with any selection criterion ...")
+ ## degug:
+ # for selection in selections :
+ # logging.warning( ' %s' % selection.strip() )
+ # logging.warning( ' record:' )
+ # for key in rec.keys() :
+ # logging.warning( ' "%s" : "%s"' % (key,rec[key]) )
+ # create empty dataframe as result:
+ df = pandas.DataFrame(columns=df.columns)
+ else :
+ # extract selected record:
+ df = df.loc[[selected[0]]]
+ #endif
+
+ #endif
+
+ # *
+
+ # return as listing:
+ lst = CSO_Listing()
+ lst.df = df
+
# ok
- return df
+ return lst
# enddef Select
@@ -1492,11 +1634,27 @@ class CSO_Listing(object):
self.df = pandas.concat((self.df, xdf.set_index(self.index_label)))
else:
# append:
- self.df = pandas.concat((self.df, lst.df))
+ if len(self.df) == 0 :
+ self.df = lst.df
+ else :
+ self.df = pandas.concat((self.df, lst.df))
+ #endif
# endif
# enddef Append
+
+ # *
+
+ def Sort(self,by="filename"):
+
+ """
+ Sort listing table by filename or other key.
+ """
+ # sort inplace:
+ self.df.sort_values(by, inplace=True)
+
+    #enddef Sort
# endclass CSO_Listing
diff --git a/py/cso_s5p.py b/py/cso_s5p.py
index 6dd1d3250173e1ac65f4f6ac7524edd3b9018aad..fbdd8faa9d9ffae6dc2811550f075d2ec0dd5f49 100644
--- a/py/cso_s5p.py
+++ b/py/cso_s5p.py
@@ -54,6 +54,9 @@
# Fixed replacement of units for special processing 'square_sum'.
# Fixed check on inquire filedates in case of multiple inquire files.
#
+# 2025-01, Arjo Segers
+# Use updated CSO_Listing class.
+#
########################################################################
@@ -2732,7 +2735,7 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
raise Exception
# endif
# init storage:
- dfs = []
+ listing = cso_file.CSO_Listing()
# loop:
for ifile in range(len(filename__templates)):
# current:
@@ -2746,31 +2749,16 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
# endif
# expand time templates
filename = t0.strftime(filename__template)
- # check ..
- if not os.path.isfile(filename):
- logging.error("inquire table not found: %s" % filename)
- raise Exception
- # endif
- # info ..
- logging.info(f"{indent}read inquire table: %s" % filename)
# read:
- dfs.append(
- pandas.read_csv(
- filename,
- sep=";",
- skip_blank_lines=True,
- parse_dates=["start_time", "end_time"],
- dtype="str",
- )
- )
+ lst = cso_file.CSO_Listing( filename )
+ # add:
+ listing.Append( lst )
# endfor # filenames
- # combine:
- df = pandas.concat(dfs)
# sort by filename:
- df = df.sort_values("filename")
+ listing.Sort()
# info ...
- logging.info(f"{indent}number of files : %i" % len(df))
+ logging.info(f"{indent}number of files : %i" % len(listing))
## selected proecessings:
# processings = self.GetSetting( 'processings' ).split()
@@ -2779,18 +2767,11 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
# list of ';' seperated selection expression:
# (%{processor_version} == '020400') & (%{processing} == 'RPRO') ; ...
- line = self.GetSetting("selection")
- # replace templates:
- # (xrec['processor_version'] == '020400') & (xrec['processing'] == 'RPRO') ; ...
- for key in df.keys():
- line = line.replace("%{" + key + "}", "xrec['" + key + "']")
- # endfor
- # split:
- selections = line.split(";")
+ selection_expr = self.GetSetting("selection")
# info ..
logging.info("selection criteria (first with matching orbit is used):")
- for selection in selections:
- logging.info(" %s" % selection.strip())
+ for expr in selection_expr.split(";"):
+ logging.info(f" {expr.strip()}" )
# endif
# skip some?
@@ -2822,12 +2803,12 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
complevel = self.GetSetting("output.complevel", "int")
# select records with start time inside time range:
- xdf = df[(df["start_time"] >= t1) & (df["start_time"] <= t2)]
+ xlst = listing.Select( tr=(t1,t2) )
# info ..
- logging.info(f"{indent}found %i orbits with overlap of time range .." % len(xdf))
+ logging.info(f"{indent}found {len(xlst)} orbits with overlap of time range .." )
# oribit labels:
- orbits = xdf["orbit"].unique()
+ orbits = xlst.GetValues("orbit").unique()
# no download initialized yet:
downloader = None
@@ -2839,70 +2820,14 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
# info ...
logging.info(indent + ' orbit "%s" ...' % orbit)
- # search for other records for same orbit:
- odf = xdf[xdf["orbit"] == orbit]
-
- # storage for status label: "selected", "blacklisted", ...
- filestatus = {}
- # no match yet ..
- seleted = []
- # loop over selection criteria,
- # this should give either none or a single file:
- for selection in selections:
- # make empty again:
- selected = []
- # loop over records:
- for indx, xrec in odf.iterrows():
- # skip?
- if os.path.basename(xrec["filename"]) in blacklist:
- filestatus[xrec["filename"]] = "blacklisted"
- continue
- # endif
- # evaluate expression including 'xrec[key]' values:
- if eval(selection):
- selected.append(xrec["filename"])
- filestatus[xrec["filename"]] = "selected"
- rec = xrec
- # endif
- # endfor # records
- # exactly one? then leave:
- if len(selected) == 1:
- break
- elif len(selected) > 1:
- logging.error(
- "found more than one orbit file matching selection: %s" % selection
- )
- for fname in selected:
- logging.error(" %s" % fname)
- # endfor
- raise Exception
- # endif # number found
- # endfor # selection criteria
-
- # info ...
- logging.info(f"{indent} available file(s):")
- # loop:
- for fname in odf["filename"]:
- line = fname
- if fname in filestatus.keys():
- line = line + " [" + filestatus[fname] + "]"
- logging.info(f"{indent} " + line)
- # endfor
-
- # no match?
- if len(selected) == 0:
- # info ...
- logging.warning(" no match with any selection criterium; next orbit ...")
- ## degug:
- # for selection in selections :
- # logging.warning( ' %s' % selection.strip() )
- # logging.warning( ' record:' )
- # for key in rec.keys() :
- # logging.warning( ' "%s" : "%s"' % (key,rec[key]) )
- # next orbit:
+ # select single orbit matching expression:
+ odf = xlst.Select( orbit=orbit, expr=selection_expr, blacklist=blacklist, indent=f" " )
+ # no orbit found? next:
+ if len(odf) == 0 :
continue
- # endif
-
+ # selected record:
+ rec = odf.GetRecord(0)
+
# start time of orbit:
t0 = rec["start_time"]
@@ -3018,68 +2943,82 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
if create:
# keep list of downloaded files:
downloads = []
+
+ # source location, either url or local mirror:
+ href = rec["href"]
+ # local file?
+ if os.path.isfile(href):
- # input dir:
- input_dir = t0.strftime(input_dir__template)
- # replace templates:
- for key in rec.keys():
- if type(rec[key]) == str:
- input_dir = input_dir.replace("%{" + key + "}", rec[key])
- # endif
- # endfor
- # full path:
- input_file = os.path.join(input_dir, rec["filename"])
+ # this is the input file:
+ input_file = href
+ # info ..
+ logging.info(" input file: %s" % input_file)
+
+ else:
+
+ # input dir:
+ input_dir = t0.strftime(input_dir__template)
+ # replace templates:
+ for key in rec.keys():
+ if type(rec[key]) == str:
+ input_dir = input_dir.replace("%{" + key + "}", rec[key])
+ # endif
+ # endfor
+ # full path:
+ input_file = os.path.join(input_dir, rec["filename"])
- # info ..
- logging.info(" input file: %s" % input_file)
- # check ..
- if not os.path.isfile(input_file):
# info ..
- logging.info(" not present yet, download ...")
- # download url:
- href = rec["href"]
- # initialize download?
- if downloader is None:
- # init downloader based on url:
- if "dataspace.copernicus.eu" in href:
- # download from Copernicus DataSpace:
- downloader = cso_dataspace.CSO_DataSpace_Downloader()
- #
- elif "s5p-pal.com" in href:
- # download from PAL:
- downloader = cso_pal.CSO_PAL_Downloader()
- #
- else:
- logging.error("no downloader class defined for url: {href}")
- raise Exception
+ logging.info(" input file: %s" % input_file)
+ # check ..
+ if not os.path.isfile(input_file):
+ # info ..
+ logging.info(" not present yet, download ...")
+ # download url:
+ href = rec["href"]
+ # initialize download?
+ if downloader is None:
+ # init downloader based on url:
+ if "dataspace.copernicus.eu" in href:
+ # download from Copernicus DataSpace:
+ downloader = cso_dataspace.CSO_DataSpace_Downloader()
+ #
+ elif "s5p-pal.com" in href:
+ # download from PAL:
+ downloader = cso_pal.CSO_PAL_Downloader()
+ #
+ else:
+ logging.error(f"no downloader class defined for url: {href}")
+ raise Exception
+ # endif
# endif
+ # download ...
+ downloader.DownloadFile(href, input_file, indent=" ")
+ # store name:
+ downloads.append(input_file)
# endif
- # download ...
- downloader.DownloadFile(href, input_file, indent=" ")
- # store name:
- downloads.append(input_file)
- # endif
- # download might have failed ..
- if not os.path.isfile(input_file):
- # write error file or raise error?
- if with_error_files:
- # info ..
- logging.warning(f"{indent} missing input file, write error file ...")
- # write error file:
- with open(output_errfile, "w") as f:
- f.write("missing file: %s\n" % input_file)
- # endwith
- # next:
- continue
- else:
- # info ..
- logging.error(f"missing input file")
- logging.error(f" {input_file}")
- logging.error(f"enable creation of *error files* to not break on this")
- raise Exception
+ # download might have failed ..
+ if not os.path.isfile(input_file):
+ # write error file or raise error?
+ if with_error_files:
+ # info ..
+ logging.warning(f"{indent} missing input file, write error file ...")
+ # write error file:
+ with open(output_errfile, "w") as f:
+ f.write("missing file: %s\n" % input_file)
+ # endwith
+ # next:
+ continue
+ else:
+ # info ..
+ logging.error(f"missing input file")
+ logging.error(f" {input_file}")
+ logging.error(f"enable creation of *error files* to not break on this")
+ raise Exception
+ # endif
# endif
- # endif
+
+ # endif # local mirror or remote source file
# info ...
logging.info(f"{indent} open ...")
@@ -3328,7 +3267,7 @@ class CSO_S5p_Download(utopya.UtopyaRc):
# combine:
df = pandas.concat(dfs)
# sort by filename:
- df = df.sort_values("filename")
+ df.sort_values("filename", inplace=True)
# info ...
logging.info(f"{indent}number of files : %i" % len(df))