Dataset#
Info#
The dataset class is an object-oriented way to use the Foundry DevTools API clients.
All methods which require a transaction to work will create one automatically and commit it automatically.
This works via the transaction_context context manager (click the link to see the documentation for it).
Examples:
with ds.transaction_context():
# will start the transaction
print(ds.transaction) # will print the dictionary of the transaction object
ds.put_file(...)
ds.remove_file(...)
# will commit the transaction unless an error happened
print(ds.transaction) # will throw an error as there is currently no open transaction
You can also chain multiple actions together.
Chaining only works with methods that return the Dataset object itself; list_files, for example, returns the list of files rather than the Dataset, so it cannot be chained further.
ds = ctx.get_dataset(...)
ds.start_transaction().put_file(...).upload_schema(...).commit_transaction()
If you start a transaction manually before entering the context manager, the context manager does nothing: it neither starts a new transaction nor commits or aborts the existing one.
ds.start_transaction()
with ds.transaction_context():
# will not start a new transaction
print(ds.transaction) # will print the transaction started earlier
# will not commit or abort the transaction
print(ds.transaction) # still accessible and open
# you'll need to close it manually
ds.commit_transaction()
The state/attributes of resources like the Dataset class may get out of “sync” if you have multiple instances for the same dataset, or if you modify the dataset in other ways than through the Dataset object. In that case, call sync() to bring the object back up to date:
ds.sync()
Uploading a dataset to Foundry#
Saves a Pandas or PySpark dataframe to Foundry.
from foundry_dev_tools import FoundryContext
import pandas as pd
df = pd.DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/test_output_dataset", create_if_not_exist=True)
dataset.save_dataframe(df)
from foundry_dev_tools import CachedFoundryClient
cached_client = CachedFoundryClient()
cached_client.save_dataset(df, '/path/to/test_output_dataset',
branch='master', exists_ok=True, mode='SNAPSHOT')
Uploading a folder to a dataset in Foundry#
Upload the complete content of a local folder to a dataset in Foundry.
from foundry_dev_tools import FoundryContext
from pathlib import Path
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/test_folder_upload", create_if_not_exist=True)
dataset.upload_folder(Path("/path/to/folder-to-upload"))
import os
from foundry_dev_tools import FoundryRestClient
upload_folder = "/path/to/folder-to-upload"
target_dataset_path = "/path/to/test_folder_upload"
file_paths = [file for file in Path(upload_folder).rglob("*") if file.is_file() and not file.name.startswith(".")]
dataset_paths_in_foundry = [str(file_path.relative_to(upload_folder)) for file_path in file_paths]
path_file_dict = dict(zip(dataset_paths_in_foundry, file_paths))
rest_client = FoundryRestClient()
dataset_rid = rest_client.get_dataset_rid(dataset_path=target_dataset_path)
transaction_rid = rest_client.open_transaction(dataset_rid=dataset_rid,
mode='UPDATE',
branch='master')
rest_client.upload_dataset_files(dataset_rid=dataset_rid,
transaction_rid=transaction_rid,
path_file_dict=path_file_dict)
rest_client.commit_transaction(dataset_rid, transaction_rid)
Save model or other type of python object#
import pickle
from foundry_dev_tools import FoundryContext
model_obj = """<PMML xmlns="http://www.dmg.org/PMML-4_1" version="4.1"></PMML>""" # can be any python object that can be pickled
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1", create_if_not_exist=True)
pickled_model = pickle.dumps(model_obj)
dataset.put_file("model.pickle", file_data=pickled_model)
from foundry_dev_tools import CachedFoundryClient
import pickle
model_obj = """<PMML xmlns="http://www.dmg.org/PMML-4_1" version="4.1"></PMML>"""
cached_client = CachedFoundryClient()
cached_client.save_model(model_obj, dataset_path_or_rid='/path/to/playground/model1',
branch='master', exists_ok=True, mode='SNAPSHOT')
Load model or other type of file (using temporary file)#
from foundry_dev_tools import FoundryContext
from pathlib import Path
import pickle
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1")
model_file = dataset.download_file(output_directory=Path("/tmp/model"),path_in_dataset="model.pickle")
with model_file.open("rb") as model:
print(pickle.load(model))
from foundry_dev_tools import FoundryRestClient
import pickle
rest_client = FoundryRestClient()
rid = rest_client.get_dataset_rid('/path/to/playground/model1')
model_file = rest_client.download_dataset_files(dataset_rid=rid, output_directory='/tmp/model', branch='master')[0]
with open(model_file, 'rb') as file:
print(pickle.load(file))
Load model or other type of file (in-memory)#
from foundry_dev_tools import FoundryContext
import pickle
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1")
model_file_bytes = dataset.get_file("model.pickle")
print(pickle.loads(model_file_bytes))
from foundry_dev_tools import FoundryRestClient
import pickle
rest_client = FoundryRestClient()
rid = rest_client.get_dataset_rid('/path/to/playground/model1')
model_file_bytes = rest_client.download_dataset_file(dataset_rid=rid,
output_directory=None,
foundry_file_path='model.pickle',
view='master')
print(pickle.loads(model_file_bytes))
Download a dataset to a temporary folder#
Downloads the dataset files to a temporary folder and reads the parquet dataset with pandas/pyarrow. When exiting the context, the temporary files are automatically deleted.
from foundry_dev_tools import FoundryContext
import pandas as pd
ctx = FoundryContext()
dataset = ctx.get_dataset("ri.foundry.main.dataset...")
with dataset.download_files_temporary() as tmp_dir:
df = pd.read_parquet(tmp_dir)
print(df.shape)
from foundry_dev_tools import FoundryRestClient
import pandas as pd
rest_client = FoundryRestClient()
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xx-xxxxxxxxxx"
with rest_client.download_dataset_files_temporary(dataset_rid=rid, view='master') as temp_folder:
df = pd.read_parquet(temp_folder)
print(df.shape)
Download only a few files from a dataset#
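You can simply specify the files you want to download: pass paths_in_dataset to download_files on the Dataset object, or pass files to download_dataset_files when using the FoundryRestClient.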
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xxx-xxxxxxxxx"
ds = ctx.get_dataset(rid)
ds.download_files(output_directory=Path("/path/to/only_few_files"),paths_in_dataset={"file1.png","file2.png"})
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xxx-xxxxxxxxx"
rest_client.download_dataset_files(dataset_rid=rid, output_directory='/path/to/only_few_files', files=['file1.png', 'file2.png'], branch='master')
Polars#
There are three ways to get Polars data from a Foundry dataset. Choose the one that fits your workload:
| Method | Data path | Evaluation | Best for |
|---|---|---|---|
| to_polars() | FoundrySqlServer | Eager (full dataset) | Quick exploration of small-medium datasets |
| query_foundry_sql(..., return_type="polars") | FoundrySqlServer | Eager (SQL-filtered) | Aggregations, joins, complex SQL queries |
| to_lazy_polars() | Direct parquet scan (S3) | Lazy | Filtering/selection on large datasets; portable code for Foundry transforms |
to_lazy_polars() scans parquet files directly via the S3-compatible API using polars.scan_parquet. Combined with Polars’ lazy evaluation, this enables predicate pushdown: filters applied to the LazyFrame are pushed down to the parquet reader, so only relevant data is read from storage.
The lazy Polars API uses the same syntax as Foundry lightweight transforms, so code written with to_lazy_polars() can be moved into a Foundry transform without rewriting.
Eager via FoundrySqlServer#
from foundry_dev_tools import FoundryContext
import polars as pl
ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
# Fetch the full dataset
df = ds.to_polars()
print(df)
# Or use SQL to filter/aggregate server-side
df = ds.query_foundry_sql("SELECT * WHERE age > 25", return_type="polars")
print(df)
from foundry_dev_tools import FoundryRestClient
rest_client = FoundryRestClient()
arrow_table = rest_client.query_foundry_sql(
"SELECT * FROM `/path/to/test_dataset`",
branch="master",
return_type="arrow",
)
import polars as pl
df = pl.from_arrow(arrow_table)
print(df)
Lazy via direct S3 parquet scan#
from foundry_dev_tools import FoundryContext
import polars as pl
ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
lazy_df: pl.LazyFrame = ds.to_lazy_polars()
# Perform lazy operations (not executed yet)
result = lazy_df.filter(pl.col("age") > 25).select("name", "age")
# Execute and collect results
df = result.collect()
print(df)
DuckDB Table from Spark SQL dialect#
Queries the Foundry SQL server using the Spark SQL dialect and loads the resulting Arrow stream with DuckDB.
from foundry_dev_tools import FoundryContext
ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
arrow_table = ds.to_arrow()
import duckdb
# Get an in-memory DuckDB database and create a new table from the result arrow table.
# Note that the python variable is automatically determined from the query string.
con = duckdb.connect()
con.execute("CREATE TABLE my_table AS SELECT * FROM arrow_table")
from foundry_dev_tools import FoundryRestClient
rest_client = FoundryRestClient()
arrow_table = rest_client.query_foundry_sql(
"SELECT * FROM `/path/to/test_dataset`",
branch="master",
return_type="arrow",
)
import duckdb
# Get an in-memory DuckDB database and create a new table from the result arrow table.
# Note that the python variable is automatically determined from the query string.
con = duckdb.connect()
con.execute("CREATE TABLE my_table AS SELECT * FROM arrow_table")