Select latest input file with latest processor version if multiple are selected. (24140838) · Commits · CAMS / CSO

src/cso/cso_dataspace.py

+10 −3

Original line number	Diff line number	Diff line
		@@ -54,6 +54,7 @@
		# 2026-04, Arjo Segers
		# Fixed definition of bounding box for global selection.
		# Added increasing delays to avoid rate limit errors from STAC catalogue inquiry.
		# Trap files that could not be downloaded.
		#


		@@ -754,7 +755,7 @@ class CSO_DataSpace_Downloader(object):
		# check ...
		if npfile != 1:
		print(f"ERROR - found {npfile} files in S3 bucket for product: {product}")
		raise Exception
		raise Exception(f"file not found in S3 bucket")
		#endif

		# loop over (single) files:
		@@ -864,8 +865,14 @@ class CSO_DataSpace_Downloader(object):

		# unknown ...
		except Exception as err:
		# check on known errors ..
		if "file not found in S3 bucket" in str(err):
		logging.warning(f"{indent} WARNING - could not download: {href}")
		break
		else:
		logging.error(f"{indent}unknown error:")
		logging.error(f"{indent} {str(err)}")
		#endif

		# endtry

src/cso/cso_s5p.py

+31 −13

Original line number	Diff line number	Diff line
		@@ -81,6 +81,9 @@
		# 2026-03, Arjo Segers
		# Updated selection of download source for Copernicus Dataspace.
		#
		# 2026-04, Arjo Segers
		# Select latest input file with latest processor version if multiple are selected.
		#

		########################################################################
		###
		@@ -3326,7 +3329,7 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		df.sort_values("filename", inplace=True)

		# info ...
		logging.info(f"{indent}number of files : %i" % len(df))
		logging.info(f"{indent}number of files : {len(df)}")

		# list of ';' separated selection expressions:
		# (%{processor_version} == '020400') & (%{processing} == 'RPRO') ; ...
		@@ -3334,14 +3337,14 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		# replace templates:
		# (xrec['processor_version'] == '020400') & (xrec['processing'] == 'RPRO') ; ...
		for key in df.keys():
		line = line.replace("%{" + key + "}", "xrec['" + key + "']")
		line = line.replace(f"%{{{key}}}", f"xrec['{key}']")
		# endfor
		# split:
		selections = line.split(";")
		# info ..
		logging.info("selection criteria (first with matching orbit is used):")
		for selection in selections:
		logging.info(" %s" % selection.strip())
		logging.info(" {selection.strip()}")
		# endif

		# skip some?
		@@ -3356,7 +3359,9 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		logging.info(f"{indent}found %i orbits with overlap of time range .." % len(xdf))

		# orbit labels:
		orbits = xdf["orbit"].unique()
		orbits = list( xdf["orbit"].unique() )
		# sorted version:
		orbits.sort()

		# no downloader initialized yet:
		downloader = None
		@@ -3366,11 +3371,14 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		# loop over orbits:
		for orbit in orbits:
		# info ...
		logging.info(indent + ' orbit "%s" ...' % orbit)
		logging.info(f"{indent} orbit '{orbit}'")

		# search for other records for same orbit:
		odf = xdf[xdf["orbit"] == orbit]

		# sort on processor version, newest first:
		odf.sort_values(by="processor_version",ascending=False)

		# storage for status label: "selected", "blacklisted", ...
		filestatus = {}
		# no match yet ..
		@@ -3380,6 +3388,8 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		for selection in selections:
		# make empty again:
		selected = []
		# latest processor version:
		pversion = None
		# loop over records:
		for indx, xrec in odf.iterrows():
		# skip?
		@@ -3389,20 +3399,27 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		# endif
		# evaluate expression including 'xrec[key]' values:
		if eval(selection):
		# already selected a record? then check on processor version:
		if len(selected) > 0:
		if xrec["processor_version"] < pversion:
		filestatus[xrec["filename"]] = "older processor version"
		continue
		#endif
		#endif
		# store:
		selected.append(xrec["filename"])
		filestatus[xrec["filename"]] = "selected"
		rec = xrec
		pversion = xrec["processor_version"]
		# endif
		# endfor # records
		# exactly one? then leave:
		if len(selected) == 1:
		break
		elif len(selected) > 1:
		logging.error(
		"found more than one orbit file matching selection: %s" % selection
		)
		logging.warning(f"{indent} found more than one orbit file matching selection: {selection}")
		for fname in selected:
		logging.error(" %s" % fname)
		logging.warning(f"{indent} {fname}")
		# endfor
		raise Exception
		# endif # number found
		@@ -3460,7 +3477,8 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		# initialize download?
		if downloader is None:
		# init downloader based on url:
		if "dataspace.copernicus.eu" in href:
		# init downloader based on url:
		if href.startswith("s3://eodata/"):
		# download from Copernicus DataSpace:
		downloader = cso_dataspace.CSO_DataSpace_Downloader()
		#
		@@ -3469,7 +3487,7 @@ class CSO_S5p_Download(utopya.UtopyaRc):
		downloader = cso_pal.CSO_PAL_Downloader()
		#
		else:
		logging.error("no downloader class defined for url: {href}")
		logging.error(f"no downloader class defined for url: {href}")
		raise Exception
		# endif
		# endif
		@@ -3481,9 +3499,9 @@ class CSO_S5p_Download(utopya.UtopyaRc):

		# download might have failed ..
		if not os.path.isfile(input_file):
		logging.error(f"missing input file")
		logging.error(f" {input_file}")
		logging.error(f"missing input file: {input_file}")
		raise Exception
		#logging.warning(f"missing input file: {input_file}")
		# endif

		# endfor # input files