Node

flypipe.node.node

Nodes are the fundamental building block of Flypipe. Simply apply the node function as a decorator to a transformation function in order to declare the transformation as a Flypipe node.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `type` | `str` | Type of the node transformation; one of `"pandas"`, `"pandas_on_spark"`, `"pyspark"`, `"spark_sql"`. | *required* |
| `description` | `str`, optional | Description of the node. | `None` |
| `group` | `str`, optional | Group the node falls under; nodes in the same group are clustered together in the Catalog UI. | `None` |
| `tags` | `List[str]`, optional | List of tags for the node. | `None` |
| `dependencies` | `List[Node]`, optional | List of other nodes this node depends on. | `None` |
| `output` | `Schema`, optional | Defines the output schema of the node. | `None` |
| `spark_context` | `bool`, optional | If `True`, the Spark context is passed to the function as an extra argument. | `False` |

Examples

``` py
# Syntax
@node(
    type="pyspark",  # or "pandas", "pandas_on_spark", "spark_sql"
    description="this is a description of what this node does",
    tags=["a", "list", "of", "tags"],
    dependencies=[other_node_1, other_node_2, ...],
    output=Schema(
        Column("col_name", String(), "a description of the column"),
    ),
    spark_context=False  # or True
)
def your_function_name(other_node_1, other_node_2, ...):
    # YOUR TRANSFORMATION LOGIC HERE
    # use pandas syntax if type is `pandas` or `pandas_on_spark`
    # use PySpark syntax if type is `pyspark`
    return dataframe
```
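
The `spark_context` flag is not exercised in the examples below. Here is a minimal sketch of how it might look; the parameter name `spark`, the function name `raw_fruits`, and the table name are assumptions, on the premise that `spark_context=True` makes Flypipe pass the active Spark session to the function as an extra argument.

``` py
# Node that receives the Spark session via spark_context=True
from flypipe.node import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import String

@node(
    type="pyspark",
    description="Reads raw fruits using the injected Spark session",
    spark_context=True,
    output=Schema(
        Column("fruit", String(), "name of the fruit")
    )
)
def raw_fruits(spark):
    # `spark` is supplied by Flypipe because spark_context=True
    return spark.table("raw.fruits")  # hypothetical table name
```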
``` py
# Node without dependency
from flypipe.node import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import String
import pandas as pd

@node(
    type="pandas",
    description="Outputs a pandas dataframe of fruits and their flavours",
    output=Schema(
        Column("fruit", String(), "name of the fruit"),
        Column("flavour", String(), "fruit flavour")
    )
)
def t1():
    # No dependencies are declared, so the function takes no arguments
    return pd.DataFrame({"fruit": ["mango"], "flavour": ["sweet"]})
```
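
The next example declares a dependency on an upstream node `t0`: `t0.select("fruit").alias("df")` requests only the `fruit` column of `t0`'s output and binds it to the function parameter `df`. `t0` itself is not defined on this page; a minimal sketch of how it might look:

``` py
# Hypothetical upstream node t0, assumed by the dependency example below
from flypipe.node import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import String
import pandas as pd

@node(
    type="pandas",
    description="Outputs a dataframe of fruits",
    output=Schema(
        Column("fruit", String(), "name of the fruit")
    )
)
def t0():
    return pd.DataFrame({"fruit": ["mango", "lemon"]})
```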
``` py
# Node with dependency
from flypipe.node import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import String
import pandas as pd

@node(
    type="pandas",
    description="Adds a flavour column to the fruits provided by t0",
    dependencies=[
        t0.select("fruit").alias("df")
    ],
    output=Schema(
        t0.output.get("fruit"),
        Column("flavour", String(), "fruit flavour")
    )
)
def t1(df):
    categories = {'mango': 'sweet', 'lemon': 'citric'}
    df['flavour'] = df['fruit']
    df = df.replace({'flavour': categories})
    return df
```
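
To execute a node together with its declared dependencies, Flypipe nodes expose a `run` method. A minimal sketch, assuming `t0` is defined as above; the exact `run` signature (in particular whether a Spark session argument is needed for a pandas-only graph) is an assumption here.

``` py
# Runs t1; Flypipe resolves and executes the upstream t0 automatically
df = t1.run()
# Expected: one row per fruit from t0 with its mapped flavour,
# e.g. mango -> sweet, lemon -> citric
print(df)
```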
Source code in flypipe/node.py
def node(type, *args, **kwargs):
    """Nodes are the fundamental building block of Flypipe. Simply apply the node function as a decorator to a
    transformation function in order to declare the transformation as a Flypipe node.

    Parameters:
        type (str): Type of the node transformation; one of "pandas", "pandas_on_spark", "pyspark", "spark_sql".
        description (str, optional): Description of the node. Defaults to `None`.
        group (str, optional): Group the node falls under; nodes in the same group are clustered together in the Catalog UI. Defaults to `None`.
        tags (List[str], optional): List of tags for the node. Defaults to `None`.
        dependencies (List[Node], optional): List of other nodes this node depends on. Defaults to `None`.
        output (Schema, optional): Defines the output schema of the node. Defaults to `None`.
        spark_context (bool, optional): If True, the Spark context is passed to the function as an extra argument. Defaults to `False`.

    Examples

    ``` py
    # Syntax
    @node(
        type="pyspark", "pandas_on_spark" or "pandas",
        description="this is a description of what this node does",
        tags=["a", "list", "of", "tags"],
        dependencies=[other_node_1, other_node_2, ...],
        output=Schema(
            Column("col_name", String(), "a description of the column"),
        ),
        spark_context = True or False
    )
    def your_function_name(other_node_1, other_node_2, ...):
        # YOUR TRANSFORMATION LOGIC HERE
        # use pandas syntax if type is `pandas` or `pandas_on_spark`
        # use PySpark syntax if type is `pyspark`
        return dataframe
    ```

    ``` py
    # Node without dependency
    from flypipe.node import node
    from flypipe.schema import Schema, Column
    from flypipe.schema.types import String
    import pandas as pd
    @node(
        type="pandas",
        description="Outputs a pandas dataframe of fruits and their flavours",
        output=Schema(
            Column("fruit", String(), "name of the fruit"),
            Column("flavour", String(), "fruit flavour")
        )
    )
    def t1():
        # No dependencies are declared, so the function takes no arguments
        return pd.DataFrame({"fruit": ["mango"], "flavour": ["sweet"]})
    ```

    ``` py
    # Node with dependency
    from flypipe.node import node
    from flypipe.schema import Schema, Column
    from flypipe.schema.types import String
    import pandas as pd
    # t0 is another Flypipe node defined elsewhere
    @node(
        type="pandas",
        description="Adds a flavour column to the fruits provided by t0",
        dependencies=[
            t0.select("fruit").alias("df")
        ],
        output=Schema(
            t0.output.get("fruit"),
            Column("flavour", String(), "fruit flavour")
        )
    )
    def t1(df):
        categories = {'mango': 'sweet', 'lemon': 'citric'}
        df['flavour'] = df['fruit']
        df = df.replace({'flavour': categories})
        return df
    ```

    """

    def decorator(func):
        # Bind the positional `type` into kwargs and wrap the function in a Node
        kwargs["type"] = type
        return Node(func, *args, **kwargs)

    return decorator