Linear Regression Model Fitting, Inference and Visualization

Tools To Be Used:

Feature Selection

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import numpy as npdf = pd.read_csv('nhanes_2015_2016.csv')
df.columns#Output:
Index(['SEQN', 'ALQ101', 'ALQ110', 'ALQ130', 'SMQ020', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'DMDCITZN', 'DMDEDUC2', 'DMDMARTL', 'DMDHHSIZ', 'WTINT2YR', 'SDMVPSU', 'SDMVSTRA', 'INDFMPIR', 'BPXSY1', 'BPXDI1', 'BPXSY2', 'BPXDI2', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC', 'BMXWAIST', 'HIQ210'], dtype='object')
keep = ['BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'RIAGENDR']
db = df[keep].dropna()
db.head()

Image for post

Linear Regression And Interpretation

model = sm.OLS.from_formula("BMXWAIST ~ BMXWT", data=db)
result = model.fit()
result.summary()

Image for post

Image for post

db.BMXWAIST.std()




Correlation

cc = db[["BMXWAIST", "BMXWT"]].corr()
print(cc)BMXWAIST     BMXWT
BMXWAIST  1.000000  0.891828
BMXWT     0.891828  1.000000
cc.BMXWAIST.BMXWT**2
result.fittedvalues

Image for post

db["RIAGENDRx"] = db.RIAGENDR.replace({1: "Male", 2: "Female"})
model = sm.OLS.from_formula("BMXWAIST ~ BMXWT + RIAGENDRx", data=db)
result = model.fit()
result.summary()

Image for post

db[['BMXWT', 'RIAGENDR']].corr()

Image for post

model = sm.OLS.from_formula("BMXWAIST ~ BMXWT + RIAGENDRx + BMXBMI", data=db)
result = model.fit()
result.summary()

Image for post




Visualization Of The Model

Confidence Interval, Calculation, and Characteristics

What is a confidence interval, how to calculate it and its important characteristics

towardsdatascience.com

 
from statsmodels.sandbox.predict_functional import predict_functional
values = {"RIAGENDRx": "Female", "RIAGENDR": 1, "BMXBMI": 25}pr, cb, fv = predict_functional(result, "BMXWT",
                values=values, ci_method="simultaneous")#Here, pr is the predicted values(pr), cb is the confidence band and #the fv is the function valuesax = sns.lineplot(fv, pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.4)
ax.set_xlabel("BMXWT")
_ = ax.set_ylabel("BMXWAIST")

Image for post

del values["BMXBMI"] # Delete this as it is now the focus variable
#del values['BMXWT']
values["BMXWT"] = 65
pr, cb, fv = predict_functional(result, "BMXBMI",
                values=values, ci_method="simultaneous")ax = sns.lineplot(fv, pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.4)
ax.set_xlabel("BMI")
_ = ax.set_ylabel("BMXWAIST")

Image for post

pp = sns.scatterplot(result.fittedvalues, result.resid)
pp.set_xlabel("Fitted values")
_ = pp.set_ylabel("Residuals")

Image for post

from statsmodels.graphics.regressionplots import plot_ccprax = plt.axes()
plot_ccpr(result, "BMXWT", ax)
ax.lines[0].set_alpha(0.2) # Reduce overplotting with transparency
_ = ax.lines[1].set_color('orange')

Image for post

ax = plt.axes()
plot_ccpr(result, "BMXBMI", ax)
ax.lines[0].set_alpha(0.2)
ax.lines[1].set_color("orange")

Image for post

 

#statistics, #linearRegression, #DataScience, #DataAnalysis, #StatisticalInference, #Python, #Statsmodels 

Leave a Reply

Close Menu