Dataset#

Info#

The dataset class is an object-oriented way to use the Foundry DevTools API clients.

All methods that require a transaction will create and commit one automatically. This works via the transaction_context context manager; see its documentation for details.

Examples:

# `ds` is a Dataset instance obtained from a FoundryContext.
with ds.transaction_context():
    # entering the context starts the transaction
    print(ds.transaction)  # prints the dictionary of the open transaction object
    ds.put_file(...)
    ds.remove_file(...)
    # leaving the context commits the transaction unless an error happened

print(ds.transaction)  # throws an error, as there is currently no open transaction

You can also chain multiple actions together. This only works with methods that return the dataset; for example, list_files does not return the Dataset instance but a list of files, so it cannot be chained further.

# Methods that return the Dataset instance can be chained fluently.
ds = ctx.get_dataset(...)
ds.start_transaction().put_file(...).upload_schema(...).commit_transaction()

And if you start a transaction manually before entering the context, the context manager does nothing: it neither starts a new transaction nor commits or aborts the existing one.

# A transaction opened manually before entering the context is reused as-is.
ds.start_transaction()
with ds.transaction_context():
    # will not start a new transaction
    print(ds.transaction)  # prints the transaction started earlier
    # will not commit or abort the transaction

print(ds.transaction)  # still accessible and open

# you'll need to close it manually
ds.commit_transaction()

The state/attributes of resources like the Dataset class may get out of “sync” if you have multiple instances for the same dataset, or modify the dataset in other ways than through the Dataset object.

# Re-fetch this instance's state from Foundry to resolve drift between instances.
ds.sync()

Uploading a dataset to Foundry#

Saves a Pandas or PySpark dataframe to Foundry.

from foundry_dev_tools import FoundryContext
import pandas as pd

df = pd.DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
ctx = FoundryContext()
# create_if_not_exist=True creates the dataset when the path does not exist yet
dataset = ctx.get_dataset_by_path("/path/to/test_output_dataset", create_if_not_exist=True)
dataset.save_dataframe(df)  # uploads the dataframe to the dataset
# Alternative: upload the same dataframe `df` through the CachedFoundryClient.
from foundry_dev_tools import CachedFoundryClient

cached_client = CachedFoundryClient()
# keyword arguments select the target branch, tolerate an existing dataset,
# and set the transaction mode used for the upload
cached_client.save_dataset(df, '/path/to/test_output_dataset',
                           branch='master', exists_ok=True, mode='SNAPSHOT')

Uploading a folder to a dataset in Foundry#

Upload the complete content of a local folder to a dataset in Foundry.

from foundry_dev_tools import FoundryContext
from pathlib import Path

ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/test_folder_upload", create_if_not_exist=True)
# uploads every file underneath the local folder into the dataset
dataset.upload_folder(Path("/path/to/folder-to-upload"))
import os
# Fix: `Path` is used below but was never imported in this example.
from pathlib import Path

from foundry_dev_tools import FoundryRestClient

# Local folder to mirror into the Foundry dataset.
upload_folder = "/path/to/folder-to-upload"
target_dataset_path = "/path/to/test_folder_upload"

# Collect every regular file under the folder, skipping hidden dot-files.
file_paths = [file for file in Path(upload_folder).rglob("*") if file.is_file() and not file.name.startswith(".")]
# Dataset-relative destination paths, preserving the local folder structure.
dataset_paths_in_foundry = [str(file_path.relative_to(upload_folder)) for file_path in file_paths]
# Maps path-in-dataset -> local file path for the upload call.
path_file_dict = dict(zip(dataset_paths_in_foundry, file_paths))

rest_client = FoundryRestClient()
dataset_rid = rest_client.get_dataset_rid(dataset_path=target_dataset_path)
transaction_rid = rest_client.open_transaction(dataset_rid=dataset_rid,
                                               mode='UPDATE',
                                               branch='master')
rest_client.upload_dataset_files(dataset_rid=dataset_rid,
                                 transaction_rid=transaction_rid,
                                 path_file_dict=path_file_dict)
# Files become visible on the branch only after the transaction is committed.
rest_client.commit_transaction(dataset_rid, transaction_rid)

Save model or other type of python object#

import pickle

from foundry_dev_tools import FoundryContext

model_obj = """<PMML xmlns="http://www.dmg.org/PMML-4_1" version="4.1"></PMML>"""  # can be any python object that can be pickled
ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1", create_if_not_exist=True)
pickled_model = pickle.dumps(model_obj)
# store the serialized bytes as a file inside the dataset
dataset.put_file("model.pickle", file_data=pickled_model)

# Alternative API: save_model uploads a python object via the CachedFoundryClient.
from foundry_dev_tools import CachedFoundryClient
import pickle

model_obj = """<PMML xmlns="http://www.dmg.org/PMML-4_1" version="4.1"></PMML>"""
cached_client = CachedFoundryClient()
cached_client.save_model(model_obj, dataset_path_or_rid='/path/to/playground/model1',
                         branch='master', exists_ok=True, mode='SNAPSHOT')

Load model or other type of file (using temporary file)#

from foundry_dev_tools import FoundryContext
from pathlib import Path
import pickle

ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1")
# download_file returns the path of the downloaded local file
model_file = dataset.download_file(output_directory=Path("/tmp/model"),path_in_dataset="model.pickle")

with model_file.open("rb") as model:
    # NOTE: only unpickle data from sources you trust
    print(pickle.load(model))

from foundry_dev_tools import FoundryRestClient
import pickle

rest_client = FoundryRestClient()
rid = rest_client.get_dataset_rid('/path/to/playground/model1')
# download_dataset_files returns a list of local paths; take the first one
model_file = rest_client.download_dataset_files(dataset_rid=rid, output_directory='/tmp/model', branch='master')[0]

with open(model_file, 'rb') as file:
    # NOTE: only unpickle data from sources you trust
    print(pickle.load(file))

Load model or other type of file (in-memory)#

from foundry_dev_tools import FoundryContext
import pickle

ctx = FoundryContext()
dataset = ctx.get_dataset_by_path("/path/to/playground/model1")
# get_file returns the raw file content as bytes, without a temporary file
model_file_bytes = dataset.get_file("model.pickle")
print(pickle.loads(model_file_bytes))

from foundry_dev_tools import FoundryRestClient
import pickle

rest_client = FoundryRestClient()
rid = rest_client.get_dataset_rid('/path/to/playground/model1')
# output_directory=None makes the client return the file content in-memory as bytes
model_file_bytes = rest_client.download_dataset_file(dataset_rid=rid,
                                                     output_directory=None,
                                                     foundry_file_path='model.pickle',
                                                     view='master')
print(pickle.loads(model_file_bytes))

Download a dataset to a temporary folder#

Downloads to a temporary folder and reading parquet dataset with pandas/pyarrow. When exiting the context, the temp files are automatically deleted.

from foundry_dev_tools import FoundryContext
import pandas as pd

ctx = FoundryContext()
dataset = ctx.get_dataset("ri.foundry.main.dataset...")
# files are downloaded into tmp_dir and deleted again when the context exits
with dataset.download_files_temporary() as tmp_dir:
    df = pd.read_parquet(tmp_dir)

# df stays usable after the temp files are gone — the data is in memory
print(df.shape)
from foundry_dev_tools import FoundryRestClient
import pandas as pd

rest_client = FoundryRestClient()
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xx-xxxxxxxxxx"
# temp_folder only exists inside the with-block; temp files are removed on exit
with rest_client.download_dataset_files_temporary(dataset_rid=rid, view='master') as temp_folder:
    df = pd.read_parquet(temp_folder)

print(df.shape)

Download only few files from dataset#

You can simply specify the list of files you want to download in download_dataset_files.

rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xxx-xxxxxxxxx"
ds = ctx.get_dataset(rid)
# only the named files are downloaded, not the whole dataset
ds.download_files(output_directory=Path("/path/to/only_few_files"),paths_in_dataset={"file1.png","file2.png"})
rid = "ri.foundry.main.dataset.xxxxxxx-xxxx-xxx-xxx-xxxxxxxxx"
# `files` restricts the download to the listed paths instead of the whole dataset.
# Fix: corrected the typo '/paht/...' -> '/path/...' in the example output directory.
rest_client.download_dataset_files(dataset_rid=rid, output_directory='/path/to/only_few_files', files=['file1.png', 'file2.png'], branch='master')

Polars DataFrame from Spark SQL dialect#

Queries the Foundry SQL server with Spark SQL dialect, load arrow stream using polars.

from foundry_dev_tools import FoundryContext
import polars as pl

ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
# return_type="arrow" yields an Arrow table, which polars can load directly
arrow_table = ds.query_foundry_sql("SELECT *",return_type="arrow")

df = pl.from_arrow(arrow_table)
print(df)
from foundry_dev_tools import FoundryRestClient

rest_client = FoundryRestClient()
# query the dataset by its Foundry path using Spark SQL dialect
arrow_table = rest_client.query_foundry_sql(
    "SELECT * FROM `/path/to/test_dataset`",
    branch="master",
    return_type="arrow",
)
import polars as pl

df = pl.from_arrow(arrow_table)
print(df)

DuckDB Table from Spark SQL dialect#

Queries the Foundry SQL server with Spark SQL dialect, load arrow stream using duckdb.

from foundry_dev_tools import FoundryContext

ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")
# fetch the whole dataset as an Arrow table
arrow_table = ds.to_arrow()

import duckdb

# Get an in-memory DuckDB database and create a new table from the result arrow table.
# Note that DuckDB resolves `arrow_table` in the SQL text to the Python variable
# of the same name (replacement scan).
con = duckdb.connect()
con.execute("CREATE TABLE my_table AS SELECT * FROM arrow_table")
from foundry_dev_tools import FoundryRestClient

rest_client = FoundryRestClient()
# query the dataset by its Foundry path using Spark SQL dialect
arrow_table = rest_client.query_foundry_sql(
    "SELECT * FROM `/path/to/test_dataset`",
    branch="master",
    return_type="arrow",
)
import duckdb

# Get an in-memory DuckDB database and create a new table from the result arrow table.
# Note that DuckDB resolves `arrow_table` in the SQL text to the Python variable
# of the same name (replacement scan).
con = duckdb.connect()
con.execute("CREATE TABLE my_table AS SELECT * FROM arrow_table")