Dataset#

Info#

The dataset class is an object-oriented way to use the Foundry DevTools API clients.

All methods which require a transaction to work will create one automatically and commit it automatically. This works via the transaction_context context manager (click the link to see the documentation for it).

Examples:

with ds.transaction_context():
    # will start the transaction
    print(ds.transaction) # will print the dictionary of the transaction object
    ds.put_file(...)
    ds.remove_file(...)
    # will commit the transaction unless an error happened

print(ds.transaction) # will throw an error as there is currently no open transaction

You can also chain multiple actions together. Only works with methods that return the dataset, e.g. list_files obviously does not return the Dataset class but the list of files and so on.

ds = ctx.get_dataset(...)
ds.start_transaction().put_file(...).upload_schema(...).commit_transaction()

And if you create a transaction manually before using the context it won’t do anything.

ds.start_transaction()
with ds.transaction_context():
    # will not start a new transaction
    print(ds.transaction) # will print the transaction started earlier
    # will not commit or abort the transaction

print(ds.transaction) # still accessible and open

# you'll need to close it manually 
ds.commit_transaction()

The state/attributes of resources like the Dataset class may get out of “sync” if you have multiple instances for the same dataset, or modify the dataset in other ways than through the Dataset object.

ds.sync()

Uploading a dataset to Foundry#

Saves a Pandas or PySpark dataframe to Foundry.

from foundry_dev_tools import FoundryContext
import pandas as pd

df = pd.DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/test_output_dataset", create_if_not_exist=True)
dataset.save_dataframe(df)
from foundry_dev_tools import CachedFoundryClient

cached_client = CachedFoundryClient()
cached_client.save_dataset(df, '/path/to/test_output_dataset',
                           branch='master', exists_ok=True, mode='SNAPSHOT')

Uploading a folder to a dataset in Foundry#

Upload the complete content of a local folder to a dataset in Foundry

from foundry_dev_tools import FoundryContext
from pathlib import Path

ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/test_folder_upload", create_if_not_exist=True)
dataset.upload_folder(Path("/path/to/folder-to-upload"))
from pathlib import Path
from foundry_dev_tools import FoundryRestClient

upload_folder = "/path/to/folder-to-upload"
target_dataset_path = "/path/to/test_folder_upload"

file_paths = [file for file in Path(upload_folder).rglob("*") if file.is_file() and not file.name.startswith(".")]
dataset_paths_in_foundry = [str(file_path.relative_to(upload_folder)) for file_path in file_paths]
path_file_dict = dict(zip(dataset_paths_in_foundry, file_paths))

rest_client = FoundryRestClient()
dataset_rid = rest_client.get_dataset_rid(dataset_path=target_dataset_path)
transaction_rid = rest_client.open_transaction(dataset_rid=dataset_rid,
                                               mode='UPDATE',
                                               branch='master')
rest_client.upload_dataset_files(dataset_rid=dataset_rid,
                                 transaction_rid=transaction_rid,
                                 path_file_dict=path_file_dict)
rest_client.commit_transaction(dataset_rid, transaction_rid)

Save model or other type of python object#

import pickle

from foundry_dev_tools import FoundryContext

model_obj = """<PMML xmlns="http://www.dmg.org/PMML-4_1" version="4.1"></PMML>"""  # can be any python object that can be pickled
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1", create_if_not_exist=True)
pickled_model = pickle.dumps(model_obj)
dataset.put_file("model.pickle", file_data=pickled_model)

from foundry_dev_tools import CachedFoundryClient
import pickle

model_obj = """<PMML xmlns="http://www.dmg.org/PMML-4_1" version="4.1"></PMML>"""
cached_client = CachedFoundryClient()
cached_client.save_model(model_obj, dataset_path_or_rid='/path/to/playground/model1',
                         branch='master', exists_ok=True, mode='SNAPSHOT')

Load model or other type of file (using temporary file)#

from foundry_dev_tools import FoundryContext
from pathlib import Path
import pickle

ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1")
model_file = dataset.download_file(output_directory=Path("/tmp/model"), path_in_dataset="model.pickle")

with model_file.open("rb") as model:
    print(pickle.load(model))

from foundry_dev_tools import FoundryRestClient
import pickle

rest_client = FoundryRestClient()
rid = rest_client.get_dataset_rid('/path/to/playground/model1')
model_file = rest_client.download_dataset_files(dataset_rid=rid, output_directory='/tmp/model', branch='master')[0]

with open(model_file, 'rb') as file:
    print(pickle.load(file))

Load model or other type of file (in-memory)#

from foundry_dev_tools import FoundryContext
import pickle

ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1")
model_file_bytes = dataset.get_file("model.pickle")
print(pickle.loads(model_file_bytes))

from foundry_dev_tools import FoundryRestClient
import pickle

rest_client = FoundryRestClient()
rid = rest_client.get_dataset_rid('/path/to/playground/model1')
model_file_bytes = rest_client.download_dataset_file(dataset_rid=rid,
                                                     output_directory=None,
                                                     foundry_file_path='model.pickle',
                                                     view='master')
print(pickle.loads(model_file_bytes))

Download a dataset to a temporary folder#

Downloads the dataset to a temporary folder and reads the parquet files with pandas/pyarrow. When exiting the context, the temporary files are automatically deleted.

from foundry_dev_tools import FoundryContext
import pandas as pd

ctx = FoundryContext()
dataset = ctx.get_dataset("ri.foundry.main.dataset...")
with dataset.download_files_temporary() as tmp_dir:
    df = pd.read_parquet(tmp_dir)

print(df.shape)
from foundry_dev_tools import FoundryRestClient
import pandas as pd

rest_client = FoundryRestClient()
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xx-xxxxxxxxxx"
with rest_client.download_dataset_files_temporary(dataset_rid=rid, view='master') as temp_folder:
    df = pd.read_parquet(temp_folder)

print(df.shape)

Download only few files from dataset#

You can simply specify the list of files you want to download in download_dataset_files

rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xxx-xxxxxxxxx"
ds = ctx.get_dataset(rid)
ds.download_files(output_directory=Path("/path/to/only_few_files"), paths_in_dataset={"file1.png", "file2.png"})
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xxx-xxxxxxxxx"
rest_client.download_dataset_files(dataset_rid=rid, output_directory='/path/to/only_few_files', files=['file1.png', 'file2.png'], branch='master')

Polars#

There are three ways to get Polars data from a Foundry dataset. Choose the one that fits your workload:

Method

Data path

Evaluation

Best for

to_polars()

FoundrySqlServer

Eager (full dataset)

Quick exploration of small-medium datasets

query_foundry_sql(..., "polars")

FoundrySqlServer

Eager (SQL-filtered)

Aggregations, joins, complex SQL queries

to_lazy_polars()

Direct parquet scan (S3)

Lazy

Filtering/selection on large datasets; portable code for Foundry transforms

to_lazy_polars() scans parquet files directly via the S3-compatible API using polars.scan_parquet. Combined with Polars’ lazy evaluation, this enables predicate pushdown: filters applied to the LazyFrame are pushed down to the parquet reader, so only relevant data is read from storage.

The lazy Polars API uses the same syntax as Foundry lightweight transforms, so code written with to_lazy_polars() can be moved into a Foundry transform without rewriting.

Eager via FoundrySqlServer#

from foundry_dev_tools import FoundryContext
import polars as pl

ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")

# Fetch the full dataset
df = ds.to_polars()
print(df)

# Or use SQL to filter/aggregate server-side
df = ds.query_foundry_sql("SELECT * WHERE age > 25", return_type="polars")
print(df)
from foundry_dev_tools import FoundryRestClient

rest_client = FoundryRestClient()
arrow_table = rest_client.query_foundry_sql(
    "SELECT * FROM `/path/to/test_dataset`",
    branch="master",
    return_type="arrow",
)
import polars as pl

df = pl.from_arrow(arrow_table)
print(df)

Lazy via direct S3 parquet scan#

from foundry_dev_tools import FoundryContext
import polars as pl

ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
lazy_df: pl.LazyFrame = ds.to_lazy_polars()

# Perform lazy operations (not executed yet)
result = lazy_df.filter(pl.col("age") > 25).select("name", "age")

# Execute and collect results
df = result.collect()
print(df)

DuckDB Table from Spark SQL dialect#

Queries the Foundry SQL server with the Spark SQL dialect and loads the resulting Arrow stream into DuckDB.

from foundry_dev_tools import FoundryContext

ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
arrow_table = ds.to_arrow()

import duckdb

# Get an in-memory DuckDB database and create a new table from the result arrow table.
# Note that the python variable is automatically determined from the query string.
con = duckdb.connect()
con.execute("CREATE TABLE my_table AS SELECT * FROM arrow_table")
from foundry_dev_tools import FoundryRestClient

rest_client = FoundryRestClient()
arrow_table = rest_client.query_foundry_sql(
    "SELECT * FROM `/path/to/test_dataset`",
    branch="master",
    return_type="arrow",
)
import duckdb

# Get an in-memory DuckDB database and create a new table from the result arrow table.
# Note that the python variable is automatically determined from the query string.
con = duckdb.connect()
con.execute("CREATE TABLE my_table AS SELECT * FROM arrow_table")