Put this utility method somewhere in your code to produce a string formatted the same way as the output of DataFrame.show().
Then include it in your logging output, for example (a non-positive truncateWidth such as -40 leaves the cell text untruncated):
log.info("at this point the dataframe named df shows as \n"+showString(df,100,-40))
/**
* Compose the string representing rows for output.
*
* @param df The DataFrame whose rows are rendered
* @param _numRows Number of rows to show; negative values are treated as 0
* @param truncateWidth If set to more than 0, truncates strings to `truncateWidth` characters and
* all cells will be aligned right.
*/
def showString(
df:DataFrame
,_numRows: Int = 20
,truncateWidth: Int = 20
): String = {
val numRows = _numRows.max(0)
val takeResult = df.take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)
// For array values, replace Seq and Array with square brackets
// For cells that are beyond `truncate` characters, replace it with the
// first `truncate-3` and "..."
val rows: Seq[Seq[String]] = df.schema.fieldNames.toSeq +: data.map { row =>
row.toSeq.map { cell =>
val str = cell match {
case null => "null"
case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
case array: Array[_] => array.mkString("[", ", ", "]")
case seq: Seq[_] => seq.mkString("[", ", ", "]")
case _ => cell.toString
}
if (truncateWidth > 0 && str.length > truncateWidth) {
// do not show ellipses for strings shorter than 4 characters.
if (truncateWidth < 4) str.substring(0, truncateWidth)
else str.substring(0, truncateWidth - 3) + "..."
} else {
str
}
}: Seq[String]
}