TNO Intern

Commit 24140838 authored by Arjo Segers's avatar Arjo Segers
Browse files

Select latest input file with latest processor version if multiple are selected.

parent 5cfc4686
Loading
Loading
Loading
Loading
+10 −3
Original line number Diff line number Diff line
@@ -54,6 +54,7 @@
# 2026-04, Arjo Segers
#   Fixed definition of bounding box for global selection.
#   Added increasing delays to avoid rate limit errors from STAC catalogue inquiry.
#   Trap files that could not be downloaded.
#


@@ -754,7 +755,7 @@ class CSO_DataSpace_Downloader(object):
        # check ...
        if npfile != 1:
            print(f"ERROR - found {npfile} files in S3 bucket for product: {product}")
            raise Exception
            raise Exception(f"file not found in S3 bucket")
        #endif

        # loop over (single) files:
@@ -864,8 +865,14 @@ class CSO_DataSpace_Downloader(object):

                # unknown ...
                except Exception as err:
                    # check on known errors ..
                    if "file not found in S3 bucket" in str(err):
                        logging.warning(f"{indent}  WARNING - could not download: {href}")
                        break
                    else:
                        logging.error(f"{indent}unknown error:")
                        logging.error(f"{indent}  {str(err)}")
                    #endif

                # endtry

+31 −13
Original line number Diff line number Diff line
@@ -81,6 +81,9 @@
# 2026-03, Arjo Segers
#   Updated selection of download source for Copernicus Dataspace.
#
# 2026-04, Arjo Segers
#   Select latest input file with latest processor version if multiple are selected.
#

########################################################################
###
@@ -3326,7 +3329,7 @@ class CSO_S5p_Download(utopya.UtopyaRc):
        df.sort_values("filename", inplace=True)

        # info ...
        logging.info(f"{indent}number of files : %i" % len(df))
        logging.info(f"{indent}number of files : {len(df)}")

        # list of ';' separated selection expressions:
        #     (%{processor_version} == '020400') & (%{processing} == 'RPRO') ; ...
@@ -3334,14 +3337,14 @@ class CSO_S5p_Download(utopya.UtopyaRc):
        # replace templates:
        #     (xrec['processor_version'] == '020400') & (xrec['processing'] == 'RPRO') ; ...
        for key in df.keys():
            line = line.replace("%{" + key + "}", "xrec['" + key + "']")
            line = line.replace(f"%{{{key}}}", f"xrec['{key}']")
        # endfor
        # split:
        selections = line.split(";")
        # info ..
        logging.info("selection criteria (first with matching orbit is used):")
        for selection in selections:
            logging.info("  %s" % selection.strip())
            logging.info("  {selection.strip()}")
        # endfor

        # skip some?
@@ -3356,7 +3359,9 @@ class CSO_S5p_Download(utopya.UtopyaRc):
        logging.info(f"{indent}found %i orbits with overlap of time range .." % len(xdf))

        # orbit labels:
        orbits = xdf["orbit"].unique()
        orbits = list( xdf["orbit"].unique() )
        # sorted version:
        orbits.sort()

        # no download initialized yet:
        downloader = None
@@ -3366,11 +3371,14 @@ class CSO_S5p_Download(utopya.UtopyaRc):
        # loop over orbits:
        for orbit in orbits:
            # info ...
            logging.info(indent + '  orbit "%s" ...' % orbit)
            logging.info(f"{indent}  orbit '{orbit}'")

            # search for other records for same orbit:
            odf = xdf[xdf["orbit"] == orbit]

            # sort on processor version, newest first:
            odf.sort_values(by="processor_version",ascending=False)

            # storage for status label: "selected", "blacklisted", ...
            filestatus = {}
            # no match yet ..
@@ -3380,6 +3388,8 @@ class CSO_S5p_Download(utopya.UtopyaRc):
            for selection in selections:
                # make empty again:
                selected = []
                # latest processor version:
                pversion = None
                # loop over records:
                for indx, xrec in odf.iterrows():
                    # skip?
@@ -3389,20 +3399,27 @@ class CSO_S5p_Download(utopya.UtopyaRc):
                    # endif
                    # evaluate expression including 'xrec[key]' values:
                    if eval(selection):
                        # already selected a record? then check on processor version:
                        if len(selected) > 0:
                            if xrec["processor_version"] < pversion:
                                filestatus[xrec["filename"]] = "older processor version"
                                continue
                            #endif
                        #endif
                        # store:
                        selected.append(xrec["filename"])
                        filestatus[xrec["filename"]] = "selected"
                        rec = xrec
                        pversion = xrec["processor_version"]
                    # endif
                # endfor # records
                # exactly one? then leave:
                if len(selected) == 1:
                    break
                elif len(selected) > 1:
                    logging.error(
                        "found more than one orbit file matching selection: %s" % selection
                    )
                    logging.warning(f"{indent}  found more than one orbit file matching selection: {selection}")
                    for fname in selected:
                        logging.error("  %s" % fname)
                        logging.warning(f"{indent}    {fname}")
                    # endfor
                    raise Exception
                # endif  # number found
@@ -3460,7 +3477,8 @@ class CSO_S5p_Download(utopya.UtopyaRc):
                # initialize download?
                if downloader is None:
                    # init downloader based on url:
                    if "dataspace.copernicus.eu" in href:
                    # init downloader based on url:
                    if href.startswith("s3://eodata/"):
                        # download from Copernicus DataSpace:
                        downloader = cso_dataspace.CSO_DataSpace_Downloader()
                    #
@@ -3469,7 +3487,7 @@ class CSO_S5p_Download(utopya.UtopyaRc):
                        downloader = cso_pal.CSO_PAL_Downloader()
                    #
                    else:
                        logging.error("no downloader class defined for url: {href}")
                        logging.error(f"no downloader class defined for url: {href}")
                        raise Exception
                    # endif
                # endif
@@ -3481,9 +3499,9 @@ class CSO_S5p_Download(utopya.UtopyaRc):

            # download might have failed ..
            if not os.path.isfile(input_file):
                logging.error(f"missing input file")
                logging.error(f"  {input_file}")
                logging.error(f"missing input file: {input_file}")
                raise Exception
                #logging.warning(f"missing input file: {input_file}")
            # endif

        # endfor  # input files