# Node Function Examples
A node function allows you to create nodes programmatically.
It is useful when you need nodes to behave in different ways, depending on specific conditions.
In [1]:
Copied!
# Build a small two-row demo dataframe and register it as a temp view
# so later nodes can read it through the Spark datasource.
fruit_rows = [
    ("LEMON", "Yellow"),
    ("LIME", "Green"),
]
df = spark.createDataFrame(data=fruit_rows, schema=["fruit", "color"])
df.createOrReplaceTempView("fruits_table")
display(df)
# Same demo data as above, registered under the "fruits_table" view.
df = spark.createDataFrame(
    data=[("LEMON", "Yellow"), ("LIME", "Green")],
    schema=["fruit", "color"],
)
df.createOrReplaceTempView("fruits_table")
display(df)
| fruit | color  |
|-------|--------|
| LEMON | Yellow |
| LIME  | Green  |
## Dynamic transformations
Suppose you want to lower-case the columns of the dataframe above. However, this dataframe could contain hundreds of columns, so you only want to apply transformations to the columns requested by the graph.
In this example, if a node imports this dataframe and selects only the column `fruit`, then only the raw column `fruit` should be queried and have the lower-case transformation applied.
In [2]:
Copied!
from flypipe import node
from flypipe import node_function
from flypipe.datasource.spark import Spark
from flypipe.schema import Schema, Column
from flypipe.schema.types import String
import pyspark.sql.functions as F
@node_function(
    requested_columns=True,
    node_dependencies=[
        Spark("fruits_table"),
    ],
)
def fruits_function(requested_columns):
    """Build, at graph-resolution time, a node that lower-cases columns.

    flypipe injects ``requested_columns`` with the columns that downstream
    nodes actually selected, so the generated node only queries and
    transforms those columns — nothing else from the source table.
    """
    print(f"Raw columns queried: {requested_columns}")

    @node(
        type="pyspark",
        dependencies=[
            # Restrict the datasource read to the requested columns only.
            Spark("fruits_table").select(requested_columns),
        ],
        output=Schema(
            [Column(col, String(), col) for col in requested_columns]
        ),
    )
    def lower(fruits_table):
        # Lower-case each requested column in turn, leaving others untouched.
        for col in requested_columns:
            print(f"lower case column `{col}`")
            fruits_table = fruits_table.withColumn(col, F.lower(col))
        return fruits_table

    return lower
from flypipe import node
from flypipe import node_function
from flypipe.datasource.spark import Spark
from flypipe.schema import Schema, Column
from flypipe.schema.types import String
import pyspark.sql.functions as F
@node_function(
    requested_columns=True,
    node_dependencies=[Spark("fruits_table")],
)
def fruits_function(requested_columns):
    """Node function returning a pyspark node that lower-cases only the
    columns the graph actually requested."""
    print(f"Raw columns queried: {requested_columns}")

    # One output column per requested column; name doubles as description.
    output_schema = Schema(
        [Column(col, String(), col) for col in requested_columns]
    )

    @node(
        type="pyspark",
        dependencies=[Spark("fruits_table").select(requested_columns)],
        output=output_schema,
    )
    def lower(fruits_table):
        for col in requested_columns:
            print(f"lower case column `{col}`")
            fruits_table = fruits_table.withColumn(col, F.lower(col))
        return fruits_table

    return lower
## Selecting `fruit`
In [3]:
Copied!
@node(
    type="pyspark",
    dependencies=[fruits_function.select("fruit")],
)
def my_fruits(fruits_function):
    # Identity node: only the selected `fruit` column reaches this point,
    # already lower-cased by the generated node.
    return fruits_function


df = my_fruits.run(spark)
display(df)
displayHTML(my_fruits.html())
@node(type="pyspark", dependencies=[fruits_function.select("fruit")])
def my_fruits(fruits_function):
    """Pass through the `fruit` column produced by the node function."""
    return fruits_function


df = my_fruits.run(spark)
display(df)
displayHTML(my_fruits.html())
Raw columns queried: ['fruit'] lower case column `fruit`
| fruit |
|-------|
| lemon |
| lime  |
Raw columns queried: ['fruit']
Out[3]: