Merge branch 'merge-sesam' into 'master' (c98d43dd) · Commits · CAMS / CSO

src/cso/cso_colhub.py

+165 −14

Original line number	Diff line number	Diff line
		@@ -24,6 +24,9 @@
		# 2025-04, Arjo Segers
		# Changed imports for python packaging.
		#
		# 2025-09, Arjo Segers
		# Added "CSO_ColHubMirror_Cleanup" class.
		#

		########################################################################
		###
		@@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):

		# endclass CSO_ColHubMirror_Missing


		########################################################################
		###
		### cleanup mirror extension
		###
		########################################################################


		class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):

		    """
		    Cleanup extension to mirror:

		    * remove files that are (now) also available in mirror

		    In the settings, define the listing files of the mirror and the extension;
		    optionally replace time templates in the filename by a specific date
		    (default: today)::

		        <rcbase>.mirror.file          :  /work/inquire/mirror.csv
		        !<rcbase>.mirror.filedate     :  2025-01-24

		        <rcbase>.mirror2.file         :  /work/inquire/mirror2.csv
		        !<rcbase>.mirror2.filedate    :  2025-01-24

		    If files in the extension are also present in the main mirror archive,
		    then these records are removed from the listing of the extension.
		    The corresponding files are renamed to an extension ``.bak``,
		    or removed if the following flag is set (default ``False``)::

		        ! remove duplicates? otherwise rename to '.bak':
		        <rcbase>.mirror2.remove_duplicates    :  False

		    Optionally define a creation mode for the (parent) directories,
		    used when the updated listing of the extension is saved::

		        ! directory creation mode:
		        <rcbase>.dmode    :  0o775

		    """

		    def __init__(self, rcfile, rcbase="", env={}, indent=""):
		        """
		        Remove records (and files) from the mirror extension that are
		        also present in the main mirror.

		        Arguments:

		        * ``rcfile`` : settings file, passed to the ``UtopyaRc`` base class
		        * ``rcbase`` : prefix of the setting keys
		        * ``env``    : passed unchanged to the ``UtopyaRc`` base class
		        * ``indent`` : prefix for the log messages

		        Raises ``FileNotFoundError`` if a duplicate record points to a file
		        that is not present on disk; records of files that were already
		        renamed or removed at that point are still dropped from the saved
		        listing.
		        """

		        # modules:
		        import os
		        import datetime

		        # tools:
		        from . import cso_file

		        # info ...
		        logging.info(f"{indent}")
		        logging.info(f"{indent}** cleanup mirror extension")
		        logging.info(f"{indent}")

		        # init base object:
		        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)

		        # default date for time templates in the listing filenames:
		        now = datetime.datetime.now()

		        # table files in main archive:
		        listfile1 = self.GetSetting("mirror.file")
		        # evaluate time? the documented key is "mirror.filedate";
		        # the key "mirror1.filedate" was read previously and is
		        # still accepted as fallback:
		        filedate1 = self.GetSetting("mirror1.filedate", totype="datetime", default=now)
		        filedate1 = self.GetSetting("mirror.filedate", totype="datetime", default=filedate1)
		        listfile1 = filedate1.strftime(listfile1)
		        # read:
		        listing1 = cso_file.CSO_Listing(listfile1)

		        # table files in extra archive:
		        listfile2 = self.GetSetting("mirror2.file")
		        # evaluate time?
		        filedate2 = self.GetSetting("mirror2.filedate", totype="datetime", default=now)
		        listfile2 = filedate2.strftime(listfile2)
		        # read:
		        listing2 = cso_file.CSO_Listing(listfile2)

		        # remove duplicate files? if not (default), rename:
		        remove_duplicates = self.GetSetting(
		            "mirror2.remove_duplicates", totype="bool", default=False
		        )

		        # info ...
		        logging.info(f"{indent}check on duplicate records in extension ...")

		        # list of records to be removed:
		        labels = []
		        try:
		            # loop over extra files:
		            for fname2, row2 in listing2.df.iterrows():
		                # only records that are also in the main archive:
		                if fname2 not in listing1.df.index:
		                    continue
		                # info ...
		                logging.info(f"{indent}file '{fname2}' already in main archive ...")
		                # actual file:
		                href = row2["href"]
		                # check ..
		                if not os.path.isfile(href):
		                    msg = f"file not found at expected location: {href}"
		                    logging.error(msg)
		                    raise FileNotFoundError(msg)
		                # endif
		                # what to do?
		                if remove_duplicates:
		                    # info ..
		                    logging.info(f"{indent} remove: {href}")
		                    # remove:
		                    os.remove(href)
		                else:
		                    # info ..
		                    logging.info(f"{indent} rename: {href}")
		                    # rename:
		                    os.rename(href, href + ".bak")
		                # endif
		                # store label for removal from listing:
		                labels.append(fname2)
		            # endfor
		        finally:
		            # Also when the loop failed half-way: the files handled so far
		            # are gone from their original location, so their records should
		            # leave the listing too; otherwise every next run would stop on
		            # the "file not found" check for them.
		            if labels:
		                # info ..
		                logging.info(f"{indent}remove {len(labels)} records from listing ...")
		                # remove records:
		                listing2.df.drop(labels, inplace=True)
		                # directory creation mode:
		                dmode = self.GetSetting("dmode", totype="int", default=None)
		                # save:
		                listing2.Save(listfile2, dmode=dmode, indent=f"{indent} ")
		            # endif
		        # endtry

		        # nothing changed?
		        if not labels:
		            # info ..
		            logging.info(f"{indent} no duplicates found ...")
		        # endif

		        # info ...
		        logging.info(f"{indent}")
		        logging.info(f"{indent}** end cleanup")
		        logging.info(f"{indent}")

		    # enddef __init__


		# endclass CSO_ColHubMirror_Cleanup


		########################################################################
		###
		### end

src/cso/cso_dataspace.py

+29 −24

Original line number	Diff line number	Diff line
		@@ -43,6 +43,9 @@
		# 2025-04, Arjo Segers
		# Changed imports for python packaging.
		#
		# 2025-10, Arjo Segers
		# Use mimetype of downloaded product to decide on postprocessing.
		#


		########################################################################
		@@ -775,6 +778,7 @@ class CSO_DataSpace_Downloader(object):
		import os
		import time
		import requests
		import magic
		import zipfile
		import shutil

		@@ -822,9 +826,25 @@ class CSO_DataSpace_Downloader(object):
		# rename:
		os.rename(tmpfile, product_file)

		# try to open product file;
		# first try if it is a zipfile:
		try:
		# file type:
		mimetype = magic.from_file( product_file, mime=True )
		# switch:
		#~ nc file:
		if mimetype == "application/x-hdf":

		# info ..
		logging.info(f"{indent}product is netcdf file, store ...")
		# this is the target netcdf file already;
		# create target dir if necessary:
		cso_file.CheckDir(output_file, dmode=dmode)
		# rename to destination:
		shutil.move(product_file, output_file)

		#~ zip file:
		elif mimetype == "application/zip":

		# info ..
		logging.info(f"{indent}product is zip file, unpack ...")
		# open as zipfile:
		arch = zipfile.ZipFile(product_file, mode="r")
		# loop over members, probably two files in a directory:
		@@ -859,27 +879,12 @@ class CSO_DataSpace_Downloader(object):
		logging.info(f"{indent}remove product file ...")
		# remove package:
		os.remove(product_file)
		#
		except Exception as err:
		# info ..
		msg = str(err)
		# logging.error("from download; message received:")
		# logging.error(" %s" % msg)
		# catch known problem ...
		if "File is not a zip file" in msg:
		# logging.warning(f"{indent}maybe download was interrupted, try again ...")
		# info ..
		logging.info(f"{indent}product is no zipfile, rename to output file ...")
		# this is probably the target file already;
		# create target dir if necessary:
		cso_file.CheckDir(output_file, dmode=dmode)
		# rename to destination:
		shutil.move(product_file, output_file)

		#~ unknown ...
		else:
		# quit with error:
		raise
		logger.error( f"unsupported mimetype '{mimetype}'" )
		raise Exception
		#endif
		# endtry

		# all ok, leave retry loop:
		break

src/cso/cso_s5p.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -2879,8 +2879,8 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
		if len(odf) == 0:
		continue
		elif len(odf) > 1:
		logging.error(f"found {len(odf)} records matching selection;" +
		" use finer selection, or something wrong in inquiry table?" )
		logging.error(f"found multiple records matching selection;" +
		" use finer selection, or something wrong in inquiry table(s)?" )
		raise Exception
		#endif
		# selected record:

src/cso/cso_file.py

+1 −1

File changed.

Contains only whitespace changes.