As part of a machine-learning deployment project, I built a proof-of-concept where I created two simple logistic regression models for a binary classification task using R's glm
function and python's scikit-learn
. Afterwards, I converted those trained simple models to PMML
s using the pmml
function in R, and the from sklearn2pmml.pipeline import PMMLPipeline
function in Python.
Next, I opened a very simple workflow in KNIME to see if I can put those two PMML
s into action. Basically the goal of this proof-of-concept is to test if IT can score new data using the PMML
s that I simply hand over to them. This exercise must produce probabilities, just like the original logistic regressions would.
In KNIME, I read a test data of only 4 rows using CSV Reader
node, read the PMML
using the PMML Reader
node, and finally get that model to score that test data using PMML Predictor
node. The problem is that the predictions are not final probabilities that I want, but one step before that (sum of coefficients times independent variable values, called XBETA I guess?). Please see the workflow and predictions in the picture below:
To get to the final probabilities, one needs to run these numbers through the sigmoid function. So basically for the first record, instead of 2.654
, I need 1/(1+exp(-2.654)) = 0.93
. I am sure the PMML
file contains the required information to enable KNIME (or any other similar platform) to perform this sigmoid operation for me, but I failed to find it. That is where I desperately need help.
I looked into regression and general regression PMML
documentations, and my PMMLs look just fine, but I can't figure out why I am unable to get those probabilities.
Any help is highly appreciated!
Attachment 1 - Here is my test data:
age credit payfreq gmi
25 550 4 1500
27 650 4 3400
35 600 2 3200
40 680 2 4000
Attachment 2 - Here is my R-generated PMML:
<?xml version="1.0"?>
<PMML version="4.2" xmlns="http://www.dmg.org/PMML-4_2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.dmg.org/PMML-4_2 http://www.dmg.org/v4-2/pmml-4-2.xsd">
<Header copyright="Copyright (c) 2018 fakici" description="Generalized Linear Regression Model">
<Extension name="user" value="fakici" extender="Rattle/PMML"/>
<Application name="Rattle/PMML" version="1.4"/>
<Timestamp>2018-10-30 17:36:39</Timestamp>
</Header>
<DataDictionary numberOfFields="5">
<DataField name="bad" optype="categorical" dataType="double"/>
<DataField name="age" optype="continuous" dataType="double"/>
<DataField name="credit" optype="continuous" dataType="double"/>
<DataField name="payfreq" optype="continuous" dataType="double"/>
<DataField name="gmi" optype="continuous" dataType="double"/>
</DataDictionary>
<GeneralRegressionModel modelName="General_Regression_Model" modelType="generalLinear" functionName="regression" algorithmName="glm" distribution="binomial" linkFunction="logit" targetReferenceCategory="1">
<MiningSchema>
<MiningField name="bad" usageType="predicted" invalidValueTreatment="returnInvalid"/>
<MiningField name="age" usageType="active" invalidValueTreatment="returnInvalid"/>
<MiningField name="credit" usageType="active" invalidValueTreatment="returnInvalid"/>
<MiningField name="payfreq" usageType="active" invalidValueTreatment="returnInvalid"/>
<MiningField name="gmi" usageType="active" invalidValueTreatment="returnInvalid"/>
</MiningSchema>
<Output>
<OutputField name="Predicted_bad" feature="predictedValue"/>
</Output>
<ParameterList>
<Parameter name="p0" label="(Intercept)"/>
<Parameter name="p1" label="age"/>
<Parameter name="p2" label="credit"/>
<Parameter name="p3" label="payfreq"/>
<Parameter name="p4" label="gmi"/>
</ParameterList>
<FactorList/>
<CovariateList>
<Predictor name="age"/>
<Predictor name="credit"/>
<Predictor name="payfreq"/>
<Predictor name="gmi"/>
</CovariateList>
<PPMatrix>
<PPCell value="1" predictorName="age" parameterName="p1"/>
<PPCell value="1" predictorName="credit" parameterName="p2"/>
<PPCell value="1" predictorName="payfreq" parameterName="p3"/>
<PPCell value="1" predictorName="gmi" parameterName="p4"/>
</PPMatrix>
<ParamMatrix>
<PCell parameterName="p0" df="1" beta="14.4782176066955"/>
<PCell parameterName="p1" df="1" beta="-0.16633241754673"/>
<PCell parameterName="p2" df="1" beta="-0.0125492006930571"/>
<PCell parameterName="p3" df="1" beta="0.422786551151072"/>
<PCell parameterName="p4" df="1" beta="-0.0005500245399861"/>
</ParamMatrix>
</GeneralRegressionModel>
</PMML>
Attachment 3 - Here is my Python-generated PMML:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML xmlns="http://www.dmg.org/PMML-4_2" xmlns:data="http://jpmml.org/jpmml-model/InlineTable" version="4.2">
<Header>
<Application name="JPMML-SkLearn" version="1.5.8"/>
<Timestamp>2018-10-30T22:10:32Z</Timestamp>
</Header>
<MiningBuildTask>
<Extension>PMMLPipeline(steps=[('mapper', DataFrameMapper(default=False, df_out=False,
features=[(['age', 'credit', 'payfreq', 'gmi'], [ContinuousDomain(high_value=None, invalid_value_replacement=None,
invalid_value_treatment='return_invalid', low_value=None,
missing_value_replacement=None, missing_value_treatment='as_is',
missing_values=None, outlier_treatment='as_is', with_data=True,
with_statistics=True), Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)])],
input_df=False, sparse=False)),
('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False))])</Extension>
</MiningBuildTask>
<DataDictionary>
<DataField name="bad" optype="categorical" dataType="double">
<Value value="0"/>
<Value value="1"/>
</DataField>
<DataField name="age" optype="continuous" dataType="double">
<Interval closure="closedClosed" leftMargin="20.0" rightMargin="50.0"/>
</DataField>
<DataField name="credit" optype="continuous" dataType="double">
<Interval closure="closedClosed" leftMargin="501.0" rightMargin="699.0"/>
</DataField>
<DataField name="payfreq" optype="continuous" dataType="double">
<Interval closure="closedClosed" leftMargin="2.0" rightMargin="4.0"/>
</DataField>
<DataField name="gmi" optype="continuous" dataType="double">
<Interval closure="closedClosed" leftMargin="1012.0" rightMargin="4197.0"/>
</DataField>
</DataDictionary>
<RegressionModel functionName="classification" normalizationMethod="softmax" algorithmName="glm" targetFieldName="bad">
<MiningSchema>
<MiningField name="bad" usageType="target"/>
<MiningField name="age" missingValueReplacement="35.05" missingValueTreatment="asMean"/>
<MiningField name="credit" missingValueReplacement="622.28" missingValueTreatment="asMean"/>
<MiningField name="payfreq" missingValueReplacement="2.74" missingValueTreatment="asMean"/>
<MiningField name="gmi" missingValueReplacement="3119.4" missingValueTreatment="asMean"/>
</MiningSchema>
<Output>
<OutputField name="probability(0)" optype="categorical" dataType="double" feature="probability" value="0"/>
<OutputField name="probability(1)" optype="categorical" dataType="double" feature="probability" value="1"/>
</Output>
<ModelStats>
<UnivariateStats field="age">
<Counts totalFreq="100.0" missingFreq="0.0" invalidFreq="0.0"/>
<NumericInfo minimum="20.0" maximum="50.0" mean="35.05" standardDeviation="9.365228240678386" median="40.5" interQuartileRange="18.0"/>
</UnivariateStats>
<UnivariateStats field="credit">
<Counts totalFreq="100.0" missingFreq="0.0" invalidFreq="0.0"/>
<NumericInfo minimum="501.0" maximum="699.0" mean="622.28" standardDeviation="76.1444784603585" median="662.0" interQuartileRange="150.5"/>
</UnivariateStats>
<UnivariateStats field="payfreq">
<Counts totalFreq="100.0" missingFreq="0.0" invalidFreq="0.0"/>
<NumericInfo minimum="2.0" maximum="4.0" mean="2.74" standardDeviation="0.9656086163658655" median="2.0" interQuartileRange="2.0"/>
</UnivariateStats>
<UnivariateStats field="gmi">
<Counts totalFreq="100.0" missingFreq="0.0" invalidFreq="0.0"/>
<NumericInfo minimum="1012.0" maximum="4197.0" mean="3119.4" standardDeviation="1282.4386379082625" median="4028.5" interQuartileRange="2944.0"/>
</UnivariateStats>
</ModelStats>
<RegressionTable targetCategory="1" intercept="0.9994024132088255">
<NumericPredictor name="age" coefficient="-0.1252021965856186"/>
<NumericPredictor name="credit" coefficient="-8.682780007730786E-4"/>
<NumericPredictor name="payfreq" coefficient="1.2605378393614861"/>
<NumericPredictor name="gmi" coefficient="1.4681704138387003E-4"/>
</RegressionTable>
<RegressionTable targetCategory="0" intercept="0.0"/>
</RegressionModel>
</PMML>