Source code for cyto_ml.data.s3

"""Thin wrapper around the s3 object store with images and metadata"""

import os
from typing import Generator

import boto3
import pandas as pd
from dotenv import load_dotenv

# Read connection settings through a .env file before looking them up below
load_dotenv()

# Each setting falls back to an empty string when its variable is not set
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "")
AWS_URL_ENDPOINT = os.getenv("AWS_URL_ENDPOINT", "")


def boto3_client() -> boto3.Session:
    """Create an S3 client from the module-level connection settings.

    The settings (``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and
    ``AWS_URL_ENDPOINT``) default to an empty string when the matching
    environment variable is unset. Empty values are handed to boto3 as
    ``None`` so that boto3 treats them as "not provided" rather than as a
    literal empty endpoint URL or empty credential.

    Returns:
        The object produced by ``boto3.client("s3", ...)``.
    """
    # NOTE(review): the annotation says ``boto3.Session`` but ``boto3.client``
    # returns a low-level service client - confirm and correct the annotation.
    return boto3.client(
        "s3",
        # ``or None`` maps the "" fallback onto boto3's "use the default" value
        aws_access_key_id=AWS_ACCESS_KEY_ID or None,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY or None,
        endpoint_url=AWS_URL_ENDPOINT or None,
    )
def bucket_keys(
    bucket_name: str, prefix: str = "/", delimiter: str = "/", start_after: str = ""
) -> Generator[str, None, None]:
    """Lazily list the object keys held in a bucket.

    Adapted from this Stack Overflow answer:
    https://stackoverflow.com/a/54014862

    Args:
        bucket_name: Bucket whose contents are listed.
        prefix: Only keys starting with this prefix are requested. Leading
            ``delimiter`` characters are stripped before it is used.
        delimiter: Used only to normalise ``prefix``; it is not passed to
            the listing call.
        start_after: Key after which listing starts. When it is empty and
            the normalised prefix ends with ``delimiter``, the prefix itself
            is used instead.

    Yields:
        The ``"Key"`` of every entry in each result page's ``"Contents"``.
    """
    paginator = boto3_client().get_paginator("list_objects_v2")
    prefix = prefix.lstrip(delimiter)

    # A "folder-like" prefix doubles as the starting point unless one was given
    if prefix.endswith(delimiter) and not start_after:
        start_after = prefix

    pages = paginator.paginate(
        Bucket=bucket_name, Prefix=prefix, StartAfter=start_after
    )
    for page in pages:
        # Pages without a "Contents" entry contribute nothing
        yield from (entry["Key"] for entry in page.get("Contents", ()))
def image_index(location: str, suffix: str = ".tif") -> pd.DataFrame:
    """Build a DataFrame index of the objects found in a bucket.

    Args:
        location: Name of the bucket to list (passed to ``bucket_keys``).
        suffix: Only keys ending with this string are kept. Defaults to
            ``".tif"``; an empty string keeps every key.

    Returns:
        A DataFrame with a single ``Filename`` column holding one
        ``<AWS_URL_ENDPOINT>/<location>/<key>`` string per matching key.

    Raises:
        KeyError: If at least one key matches and ``AWS_URL_ENDPOINT`` is
            not present in the environment.
    """
    # Match on the end of the key: a substring test would also accept keys
    # that merely contain the suffix somewhere (e.g. "a.tif.bak").
    matching_keys = [key for key in bucket_keys(location) if key.endswith(suffix)]
    return pd.DataFrame(
        [f"{os.environ['AWS_URL_ENDPOINT']}/{location}/{key}" for key in matching_keys],
        columns=["Filename"],
    )