DataFrame.__getattr__(name)
|
Returns the Column denoted by name. |
DataFrame.__getitem__(item)
|
Returns the column as a Column. |
DataFrame.agg(*exprs)
|
Aggregate on the entire DataFrame without groups (shorthand for df.groupBy().agg()). |
DataFrame.alias(alias)
|
Returns a new DataFrame with an alias set. |
DataFrame.approxQuantile(col, probabilities, ...)
|
Calculates the approximate quantiles of numerical columns of a DataFrame. |
DataFrame.asTable()
|
Converts the DataFrame into a table_arg.TableArg object, which can be used as a table argument in a TVF (Table-Valued Function), including a UDTF (User-Defined Table Function). |
DataFrame.cache()
|
Persists the DataFrame with the default storage level (MEMORY_AND_DISK_DESER). |
DataFrame.checkpoint([eager])
|
Returns a checkpointed version of this DataFrame. |
DataFrame.coalesce(numPartitions)
|
Returns a new DataFrame that has exactly numPartitions partitions. |
DataFrame.colRegex(colName)
|
Selects column based on the column name specified as a regex and returns it as Column. |
DataFrame.collect()
|
Returns all the records in the DataFrame as a list of Row. |
DataFrame.columns
|
Retrieves the names of all columns in the DataFrame as a list. |
DataFrame.corr(col1, col2[, method])
|
Calculates the correlation of two columns of a DataFrame as a double value. |
DataFrame.count()
|
Returns the number of rows in this DataFrame. |
DataFrame.cov(col1, col2)
|
Calculate the sample covariance for the given columns, specified by their names, as a double value. |
DataFrame.createGlobalTempView(name)
|
Creates a global temporary view with this DataFrame. |
DataFrame.createOrReplaceGlobalTempView(name)
|
Creates or replaces a global temporary view using the given name. |
DataFrame.createOrReplaceTempView(name)
|
Creates or replaces a local temporary view with this DataFrame. |
DataFrame.createTempView(name)
|
Creates a local temporary view with this DataFrame. |
DataFrame.crossJoin(other)
|
Returns the cartesian product with another DataFrame. |
DataFrame.crosstab(col1, col2)
|
Computes a pair-wise frequency table of the given columns. |
DataFrame.cube(*cols)
|
Create a multi-dimensional cube for the current DataFrame using the specified columns, allowing aggregations to be performed on them. |
DataFrame.describe(*cols)
|
Computes basic statistics for numeric and string columns. |
DataFrame.distinct()
|
Returns a new DataFrame containing the distinct rows in this DataFrame. |
DataFrame.drop(*cols)
|
Returns a new DataFrame without specified columns. |
DataFrame.dropDuplicates([subset])
|
Return a new DataFrame with duplicate rows removed, optionally only considering certain columns. |
DataFrame.dropDuplicatesWithinWatermark([subset])
|
Return a new DataFrame with duplicate rows removed, optionally only considering certain columns, within watermark. |
DataFrame.drop_duplicates([subset])
|
drop_duplicates() is an alias for dropDuplicates().
|
DataFrame.dropna([how, thresh, subset])
|
Returns a new DataFrame omitting rows with null or NaN values. |
DataFrame.dtypes
|
Returns all column names and their data types as a list. |
DataFrame.exceptAll(other)
|
Return a new DataFrame containing rows in this DataFrame but not in another DataFrame while preserving duplicates. |
DataFrame.executionInfo
|
Returns an ExecutionInfo object after the query was executed. |
DataFrame.exists()
|
Return a Column object for an EXISTS Subquery. |
DataFrame.explain([extended, mode])
|
Prints the (logical and physical) plans to the console for debugging purposes. |
DataFrame.fillna(value[, subset])
|
Returns a new DataFrame in which null values are filled with a new value. |
DataFrame.filter(condition)
|
Filters rows using the given condition. |
DataFrame.first()
|
Returns the first row as a Row. |
DataFrame.foreach(f)
|
Applies the f function to all Row of this DataFrame. |
DataFrame.foreachPartition(f)
|
Applies the f function to each partition of this DataFrame. |
DataFrame.freqItems(cols[, support])
|
Finds frequent items for columns, possibly with false positives. |
DataFrame.groupBy(*cols)
|
Groups the DataFrame by the specified columns so that aggregation can be performed on them. |
DataFrame.groupingSets(groupingSets, *cols)
|
Create multi-dimensional aggregation for the current DataFrame using the specified grouping sets, so we can run aggregation on them. |
DataFrame.head([n])
|
Returns the first n rows. |
DataFrame.hint(name, *parameters)
|
Specifies some hint on the current DataFrame. |
DataFrame.inputFiles()
|
Returns a best-effort snapshot of the files that compose this DataFrame. |
DataFrame.intersect(other)
|
Return a new DataFrame containing rows only in both this DataFrame and another DataFrame. |
DataFrame.intersectAll(other)
|
Return a new DataFrame containing rows in both this DataFrame and another DataFrame while preserving duplicates. |
DataFrame.isEmpty()
|
Checks if the DataFrame is empty and returns a boolean value. |
DataFrame.isLocal()
|
Returns True if the collect() and take() methods can be run locally (without any Spark executors). |
DataFrame.isStreaming
|
Returns True if this DataFrame contains one or more sources that continuously return data as it arrives. |
DataFrame.join(other[, on, how])
|
Joins with another DataFrame, using the given join expression. |
DataFrame.limit(num)
|
Limits the result count to the number specified. |
DataFrame.lateralJoin(other[, on, how])
|
Lateral joins with another DataFrame, using the given join expression. |
DataFrame.localCheckpoint([eager, storageLevel])
|
Returns a locally checkpointed version of this DataFrame. |
DataFrame.mapInPandas(func, schema[, ...])
|
Maps an iterator of batches in the current DataFrame using a Python native function that is performed on pandas DataFrames both as input and output, and returns the result as a DataFrame. |
DataFrame.mapInArrow(func, schema[, ...])
|
Maps an iterator of batches in the current DataFrame using a Python native function that is performed on pyarrow.RecordBatch objects both as input and output, and returns the result as a DataFrame. |
DataFrame.metadataColumn(colName)
|
Selects a metadata column based on its logical column name and returns it as a Column. |
DataFrame.melt(ids, values, ...)
|
Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set. |
DataFrame.na
|
Returns a DataFrameNaFunctions for handling missing values. |
DataFrame.observe(observation, *exprs)
|
Define (named) metrics to observe on the DataFrame. |
DataFrame.offset(num)
|
Returns a new DataFrame by skipping the first num rows. |
DataFrame.orderBy(*cols, **kwargs)
|
Returns a new DataFrame sorted by the specified column(s). |
DataFrame.persist([storageLevel])
|
Sets the storage level to persist the contents of the DataFrame across operations after the first time it is computed. |
DataFrame.plot
|
Returns a plot.core.PySparkPlotAccessor for plotting functions. |
DataFrame.printSchema([level])
|
Prints out the schema in the tree format. |
DataFrame.randomSplit(weights[, seed])
|
Randomly splits this DataFrame with the provided weights. |
DataFrame.rdd
|
Returns the content as a pyspark.RDD of Row. |
DataFrame.registerTempTable(name)
|
Registers this DataFrame as a temporary table using the given name. |
DataFrame.repartition(numPartitions, *cols)
|
Returns a new DataFrame partitioned by the given partitioning expressions. |
DataFrame.repartitionByRange(numPartitions, ...)
|
Returns a new DataFrame partitioned by the given partitioning expressions. |
DataFrame.replace(to_replace[, value, subset])
|
Returns a new DataFrame replacing a value with another value. |
DataFrame.rollup(*cols)
|
Create a multi-dimensional rollup for the current DataFrame using the specified columns, allowing for aggregation on them. |
DataFrame.sameSemantics(other)
|
Returns True when the logical query plans inside both DataFrames are equal and therefore return the same results. |
DataFrame.sample([withReplacement, ...])
|
Returns a sampled subset of this DataFrame. |
DataFrame.sampleBy(col, fractions[, seed])
|
Returns a stratified sample without replacement based on the fraction given on each stratum. |
DataFrame.scalar()
|
Return a Column object for a SCALAR Subquery containing exactly one row and one column. |
DataFrame.schema
|
Returns the schema of this DataFrame as a pyspark.sql.types.StructType. |
DataFrame.select(*cols)
|
Projects a set of expressions and returns a new DataFrame. |
DataFrame.selectExpr(*expr)
|
Projects a set of SQL expressions and returns a new DataFrame. |
DataFrame.semanticHash()
|
Returns a hash code of the logical query plan against this DataFrame. |
DataFrame.show([n, truncate, vertical])
|
Prints the first n rows of the DataFrame to the console. |
DataFrame.sort(*cols, **kwargs)
|
Returns a new DataFrame sorted by the specified column(s). |
DataFrame.sortWithinPartitions(*cols, **kwargs)
|
Returns a new DataFrame with each partition sorted by the specified column(s). |
DataFrame.sparkSession
|
Returns the Spark session that created this DataFrame. |
DataFrame.stat
|
Returns a DataFrameStatFunctions for statistic functions. |
DataFrame.storageLevel
|
Get the DataFrame's current storage level. |
DataFrame.subtract(other)
|
Return a new DataFrame containing rows in this DataFrame but not in another DataFrame. |
DataFrame.summary(*statistics)
|
Computes specified statistics for numeric and string columns. |
DataFrame.tail(num)
|
Returns the last num rows as a list of Row. |
DataFrame.take(num)
|
Returns the first num rows as a list of Row. |
DataFrame.to(schema)
|
Returns a new DataFrame where each row is reconciled to match the specified schema. |
DataFrame.toArrow()
|
Returns the contents of this DataFrame as a PyArrow pyarrow.Table. |
DataFrame.toDF(*cols)
|
Returns a new DataFrame with new specified column names. |
DataFrame.toJSON([use_unicode])
|
Converts a DataFrame into an RDD of strings. |
DataFrame.toLocalIterator([prefetchPartitions])
|
Returns an iterator that contains all of the rows in this DataFrame. |
DataFrame.toPandas()
|
Returns the contents of this DataFrame as a Pandas pandas.DataFrame. |
DataFrame.transform(func, *args, **kwargs)
|
Returns a new DataFrame. Concise syntax for chaining custom transformations. |
DataFrame.transpose([indexColumn])
|
Transposes a DataFrame such that the values in the specified index column become the new columns of the DataFrame. |
DataFrame.union(other)
|
Return a new DataFrame containing the union of rows in this and another DataFrame. |
DataFrame.unionAll(other)
|
Return a new DataFrame containing the union of rows in this and another DataFrame. |
DataFrame.unionByName(other[, ...])
|
Returns a new DataFrame containing union of rows in this and another DataFrame. |
DataFrame.unpersist([blocking])
|
Marks the DataFrame as non-persistent, and remove all blocks for it from memory and disk. |
DataFrame.unpivot(ids, values, ...)
|
Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set. |
DataFrame.where(condition)
|
where() is an alias for filter().
|
DataFrame.withColumn(colName, col)
|
Returns a new DataFrame by adding a column or replacing the existing column that has the same name. |
DataFrame.withColumns(*colsMap)
|
Returns a new DataFrame by adding multiple columns or replacing the existing columns that have the same names. |
DataFrame.withColumnRenamed(existing, new)
|
Returns a new DataFrame by renaming an existing column. |
DataFrame.withColumnsRenamed(colsMap)
|
Returns a new DataFrame by renaming multiple columns. |
DataFrame.withMetadata(columnName, metadata)
|
Returns a new DataFrame by updating an existing column with metadata. |
DataFrame.withWatermark(eventTime, ...)
|
Defines an event time watermark for this DataFrame. |
DataFrame.write
|
Interface for saving the content of the non-streaming DataFrame out into external storage. |
DataFrame.writeStream
|
Interface for saving the content of the streaming DataFrame out into external storage. |
DataFrame.writeTo(table)
|
Create a write configuration builder for v2 sources. |
DataFrame.mergeInto(table, condition)
|
Merges a set of updates, insertions, and deletions based on a source table into a target table. |
DataFrame.pandas_api([index_col])
|
Converts the existing DataFrame into a pandas-on-Spark DataFrame. |
DataFrameNaFunctions.drop([how, thresh, subset])
|
Returns a new DataFrame omitting rows with null or NaN values. |
DataFrameNaFunctions.fill(value[, subset])
|
Returns a new DataFrame in which null values are filled with a new value. |
DataFrameNaFunctions.replace(to_replace[, ...])
|
Returns a new DataFrame replacing a value with another value. |
DataFrameStatFunctions.approxQuantile(col, ...)
|
Calculates the approximate quantiles of numerical columns of a DataFrame. |
DataFrameStatFunctions.corr(col1, col2[, method])
|
Calculates the correlation of two columns of a DataFrame as a double value. |
DataFrameStatFunctions.cov(col1, col2)
|
Calculate the sample covariance for the given columns, specified by their names, as a double value. |
DataFrameStatFunctions.crosstab(col1, col2)
|
Computes a pair-wise frequency table of the given columns. |
DataFrameStatFunctions.freqItems(cols[, support])
|
Finds frequent items for columns, possibly with false positives. |
DataFrameStatFunctions.sampleBy(col, fractions)
|
Returns a stratified sample without replacement based on the fraction given on each stratum. |