# -*- coding: utf-8 -*-
import pandas
import statsmodels.formula.api as smf
import seaborn
import matplotlib.pyplot as plt
data = pandas.read_csv('marscrater_pds.csv', low_memory=False)
print (len(data))
print (len(data.columns))
def latitude_categorisation_function(data):
    if -100 <= data['LATITUDE_CIRCLE_IMAGE'] < -50:
        return "south pole"
    elif -50 <= data['LATITUDE_CIRCLE_IMAGE'] < 0:
        return "south equator"
    elif 0 <= data['LATITUDE_CIRCLE_IMAGE'] < 50:
        return "north equator"
    elif 50 <= data['LATITUDE_CIRCLE_IMAGE'] <= 100:
        return "north pole"
# Define the function for longitude categorisation
def longitude_categorisation_function(data):
    if -200 <= data['LONGITUDE_CIRCLE_IMAGE'] < -100:
        return "1"
    elif -100 <= data['LONGITUDE_CIRCLE_IMAGE'] < 0:
        return "2"
    elif 0 <= data['LONGITUDE_CIRCLE_IMAGE'] < 100:
        return "3"
    elif 100 <= data['LONGITUDE_CIRCLE_IMAGE'] <= 200:
        return "4"
# Categorise the latitude
data['Latitude_areas'] = data.apply(latitude_categorisation_function, axis=1)
data['Latitude_areas'] = data['Latitude_areas'].astype('category')
data['Longitude_areas'] = data.apply(longitude_categorisation_function, axis=1)
data['Longitude_areas'] = data['Longitude_areas'].astype('category')
# ANOVA between latitude and number of layers
print ("ANOVA: latitude and number of layers.")
anova_model_latitude_layers = smf.ols (formula = 'NUMBER_LAYERS ~ C(Latitude_areas)', data=data)
print(anova_model_latitude_layers.fit().summary())
seaborn.factorplot(x="Latitude_areas", y="NUMBER_LAYERS", data=data)
plt.xlabel("Latitude")
plt.ylabel("Number of layers")
# Comparison of means and standard deviations
mean_latitude_layers = data.groupby("Latitude_areas")['NUMBER_LAYERS'].mean()
print(mean_latitude_layers)
std_latitude_layers = data.groupby("Latitude_areas")['NUMBER_LAYERS'].std()
print(std_latitude_layers)
print ("As also highlighted by the graph, craters near the north pole have the highest number of layers. It is investigated now the possibility that this relationship might be influenced by the crater's dimension.")
print ("A two-way ANOVA is then performed, to elucidate this possible association. The crater dimension is categorised, from smaller to greater craters.")
# Define the function for diameter categorisation
def diameter_categorisation_function(data):
    if 0 <= data['DIAM_CIRCLE_IMAGE'] < 1:
        return "0-1"
    elif 1 <= data['DIAM_CIRCLE_IMAGE'] < 2:
        return "1-2"
    elif 2 <= data['DIAM_CIRCLE_IMAGE'] < 3:
        return "2-3"
    elif 3 <= data['DIAM_CIRCLE_IMAGE'] < 4:
        return "3-4"
    elif 4 <= data['DIAM_CIRCLE_IMAGE'] < 5:
        return "4-5"
    elif 5 <= data['DIAM_CIRCLE_IMAGE'] < 6:
        return "5-6"
    elif 6 <= data['DIAM_CIRCLE_IMAGE'] < 7:
        return "6-7"
    elif 7 <= data['DIAM_CIRCLE_IMAGE'] < 8:
        return "7-8"
    elif 8 <= data['DIAM_CIRCLE_IMAGE'] < 9:
        return "8-9"
    elif 9 <= data['DIAM_CIRCLE_IMAGE'] < 10:
        return "9-10"
    elif 10 <= data['DIAM_CIRCLE_IMAGE'] < 20:
        return "10-20"
    elif 20 <= data['DIAM_CIRCLE_IMAGE'] < 40:
        return "20-40"
    elif 40 <= data['DIAM_CIRCLE_IMAGE'] < 60:
        return "40-60"
    elif 60 <= data['DIAM_CIRCLE_IMAGE'] < 100:
        return "60-100"
    elif 100 <= data['DIAM_CIRCLE_IMAGE'] <= 1165:
        return "100-1165"
# Categorise the crater diameter
data['Crater_size_category'] = data.apply(diameter_categorisation_function, axis=1)
data['Crater_size_category'] = data['Crater_size_category'].astype('category')
### Two-way ANOVA
anova_model_latitude_layers_two_way_crater_size = []
for category in data['Crater_size_category'].unique():
    print ("Two-way ANOVA: number of layers vs latitude for crater category size %s" % (category))
    data_subset = data[data['Crater_size_category'] == category]
    anova_model_latitude_layers_two_way = smf.ols(formula='NUMBER_LAYERS ~ C(Latitude_areas)', data=data_subset)
    anova_model_latitude_layers_two_way_crater_size.append(anova_model_latitude_layers_two_way)
    print(anova_model_latitude_layers_two_way.fit().summary())
print ("The two-way ANOVA revealed that the relationship between the latitude and the number of layers is preserved when the crater dimension is small, while it is lost when the crater is very big in size.")
print ("At the end, this two-way ANOVA highlighted that the crater dimension plays an important role in the correlation between latitude and number of layers, and that this variable has to be taken into account to avoid drawing misleading conclusions.")
Output :
ANOVA: latitude and number of layers.
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.007
Model: OLS Adj. R-squared: 0.007
Method: Least Squares F-statistic: 918.7
Date: Mon, 27 Jun 2016 Prob (F-statistic): 0.00
Time: 09:37:58 Log-Likelihood: -87460.
No. Observations: 384343 AIC: 1.749e+05
Df Residuals: 384339 BIC: 1.750e+05
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.0703 0.001 83.229 0.000 0.069 0.072
C(Latitude_areas)[T.north pole] 0.0942 0.002 42.193 0.000 0.090 0.099
C(Latitude_areas)[T.south equator] -0.0185 0.001 -16.957 0.000 -0.021 -0.016
C(Latitude_areas)[T.south pole] -0.0141 0.002 -8.092 0.000 -0.017 -0.011
==============================================================================
Omnibus: 413227.763 Durbin-Watson: 1.509
Prob(Omnibus): 0.000 Jarque-Bera (JB): 25676301.688
Skew: 5.639 Prob(JB): 0.00
Kurtosis: 41.421 Cond. No. 5.57
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Latitude_areas
north equator 0.070320
north pole 0.164545
south equator 0.051816
south pole 0.056241
Name: NUMBER_LAYERS, dtype: float64
Latitude_areas
north equator 0.321145
north pole 0.485897
south equator 0.270035
south pole 0.270795
Name: NUMBER_LAYERS, dtype: float64
As also highlighted by the graph, craters near the north pole have the highest number of layers. We now investigate whether this relationship might be influenced by the crater's size.
A two-way ANOVA is therefore performed to elucidate this possible association. The crater diameter is categorised from smaller to larger craters.
Two-way ANOVA: number of layers vs latitude for crater category size 60-100
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.018
Model: OLS Adj. R-squared: 0.015
Method: Least Squares F-statistic: 6.117
Date: Mon, 27 Jun 2016 Prob (F-statistic): 0.000401
Time: 09:38:21 Log-Likelihood: -92.222
No. Observations: 1010 AIC: 192.4
Df Residuals: 1006 BIC: 212.1
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.0352 0.017 2.118 0.034 0.003 0.068
C(Latitude_areas)[T.north pole] 0.2426 0.065 3.746 0.000 0.116 0.370
C(Latitude_areas)[T.south equator] -0.0112 0.020 -0.557 0.578 -0.051 0.028
C(Latitude_areas)[T.south pole] -0.0352 0.025 -1.388 0.165 -0.085 0.015
==============================================================================
Omnibus: 1686.978 Durbin-Watson: 2.057
Prob(Omnibus): 0.000 Jarque-Bera (JB): 688671.761
Skew: 10.838 Prob(JB): 0.00
Kurtosis: 129.074 Cond. No. 9.17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 40-60
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.005
Model: OLS Adj. R-squared: 0.003
Method: Least Squares F-statistic: 3.300
Date: Mon, 27 Jun 2016 Prob (F-statistic): 0.0196
Time: 09:38:21 Log-Likelihood: -1151.3
No. Observations: 2113 AIC: 2311.
Df Residuals: 2109 BIC: 2333.
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.0994 0.018 5.497 0.000 0.064 0.135
C(Latitude_areas)[T.north pole] 0.0117 0.065 0.180 0.857 -0.115 0.139
C(Latitude_areas)[T.south equator] -0.0311 0.022 -1.413 0.158 -0.074 0.012
C(Latitude_areas)[T.south pole] -0.0829 0.027 -3.048 0.002 -0.136 -0.030
==============================================================================
Omnibus: 2569.814 Durbin-Watson: 2.021
Prob(Omnibus): 0.000 Jarque-Bera (JB): 199459.206
Skew: 6.680 Prob(JB): 0.00
Kurtosis: 48.684 Cond. No. 8.44
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 20-40
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.022
Model: OLS Adj. R-squared: 0.022
Method: Least Squares F-statistic: 56.49
Date: Mon, 27 Jun 2016 Prob (F-statistic): 4.20e-36
Time: 09:38:21 Log-Likelihood: -8121.8
No. Observations: 7466 AIC: 1.625e+04
Df Residuals: 7462 BIC: 1.628e+04
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.3598 0.016 22.061 0.000 0.328 0.392
C(Latitude_areas)[T.north pole] 0.1660 0.054 3.069 0.002 0.060 0.272
C(Latitude_areas)[T.south equator] -0.1625 0.020 -8.194 0.000 -0.201 -0.124
C(Latitude_areas)[T.south pole] -0.2922 0.026 -11.279 0.000 -0.343 -0.241
==============================================================================
Omnibus: 5084.510 Durbin-Watson: 1.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 50669.218
Skew: 3.324 Prob(JB): 0.00
Kurtosis: 13.894 Cond. No. 7.77
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 10-20
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.032
Model: OLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 146.7
Date: Mon, 27 Jun 2016 Prob (F-statistic): 1.47e-93
Time: 09:38:21 Log-Likelihood: -15436.
No. Observations: 13487 AIC: 3.088e+04
Df Residuals: 13483 BIC: 3.091e+04
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.5440 0.012 43.974 0.000 0.520 0.568
C(Latitude_areas)[T.north pole] 0.3259 0.039 8.291 0.000 0.249 0.403
C(Latitude_areas)[T.south equator] -0.2011 0.015 -13.173 0.000 -0.231 -0.171
C(Latitude_areas)[T.south pole] -0.3151 0.021 -15.182 0.000 -0.356 -0.274
==============================================================================
Omnibus: 4780.492 Durbin-Watson: 1.933
Prob(Omnibus): 0.000 Jarque-Bera (JB): 13805.147
Skew: 1.912 Prob(JB): 0.00
Kurtosis: 6.153 Cond. No. 7.17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 9-10
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.066
Model: OLS Adj. R-squared: 0.065
Method: Least Squares F-statistic: 64.48
Date: Mon, 27 Jun 2016 Prob (F-statistic): 2.80e-40
Time: 09:38:21 Log-Likelihood: -2647.3
No. Observations: 2735 AIC: 5303.
Df Residuals: 2731 BIC: 5326.
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.5887 0.022 26.672 0.000 0.545 0.632
C(Latitude_areas)[T.north pole] 0.4964 0.069 7.157 0.000 0.360 0.632
C(Latitude_areas)[T.south equator] -0.2278 0.028 -8.184 0.000 -0.282 -0.173
C(Latitude_areas)[T.south pole] -0.3248 0.039 -8.334 0.000 -0.401 -0.248
==============================================================================
Omnibus: 522.056 Durbin-Watson: 1.939
Prob(Omnibus): 0.000 Jarque-Bera (JB): 881.124
Skew: 1.257 Prob(JB): 4.64e-192
Kurtosis: 4.190 Cond. No. 6.75
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 8-9
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.064
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 77.29
Date: Mon, 27 Jun 2016 Prob (F-statistic): 2.32e-48
Time: 09:38:21 Log-Likelihood: -3188.9
No. Observations: 3404 AIC: 6386.
Df Residuals: 3400 BIC: 6410.
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.5571 0.019 29.220 0.000 0.520 0.595
C(Latitude_areas)[T.north pole] 0.4679 0.060 7.858 0.000 0.351 0.585
C(Latitude_areas)[T.south equator] -0.2064 0.024 -8.587 0.000 -0.254 -0.159
C(Latitude_areas)[T.south pole] -0.3271 0.035 -9.410 0.000 -0.395 -0.259
==============================================================================
Omnibus: 687.254 Durbin-Watson: 1.969
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1219.237
Skew: 1.286 Prob(JB): 1.76e-265
Kurtosis: 4.407 Cond. No. 6.69
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 7-8
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.077
Model: OLS Adj. R-squared: 0.076
Method: Least Squares F-statistic: 117.4
Date: Mon, 27 Jun 2016 Prob (F-statistic): 4.86e-73
Time: 09:38:21 Log-Likelihood: -3697.4
No. Observations: 4238 AIC: 7403.
Df Residuals: 4234 BIC: 7428.
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.5476 0.016 34.398 0.000 0.516 0.579
C(Latitude_areas)[T.north pole] 0.5038 0.047 10.814 0.000 0.413 0.595
C(Latitude_areas)[T.south equator] -0.1986 0.020 -9.816 0.000 -0.238 -0.159
C(Latitude_areas)[T.south pole] -0.2938 0.029 -10.227 0.000 -0.350 -0.237
==============================================================================
Omnibus: 595.076 Durbin-Watson: 1.916
Prob(Omnibus): 0.000 Jarque-Bera (JB): 872.538
Skew: 1.054 Prob(JB): 3.39e-190
Kurtosis: 3.706 Cond. No. 6.25
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 6-7
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.067
Model: OLS Adj. R-squared: 0.066
Method: Least Squares F-statistic: 130.4
Date: Mon, 27 Jun 2016 Prob (F-statistic): 1.29e-81
Time: 09:38:21 Log-Likelihood: -4715.2
No. Observations: 5467 AIC: 9438.
Df Residuals: 5463 BIC: 9465.
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.4986 0.014 36.486 0.000 0.472 0.525
C(Latitude_areas)[T.north pole] 0.5595 0.039 14.206 0.000 0.482 0.637
C(Latitude_areas)[T.south equator] -0.1330 0.018 -7.573 0.000 -0.167 -0.099
C(Latitude_areas)[T.south pole] -0.2045 0.025 -8.234 0.000 -0.253 -0.156
==============================================================================
Omnibus: 657.172 Durbin-Watson: 1.958
Prob(Omnibus): 0.000 Jarque-Bera (JB): 914.473
Skew: 0.983 Prob(JB): 2.66e-199
Kurtosis: 3.384 Cond. No. 6.04
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 5-6
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.057
Model: OLS Adj. R-squared: 0.057
Method: Least Squares F-statistic: 149.3
Date: Mon, 27 Jun 2016 Prob (F-statistic): 6.65e-94
Time: 09:38:21 Log-Likelihood: -6084.7
No. Observations: 7374 AIC: 1.218e+04
Df Residuals: 7370 BIC: 1.221e+04
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.4492 0.011 39.949 0.000 0.427 0.471
C(Latitude_areas)[T.north pole] 0.4614 0.030 15.523 0.000 0.403 0.520
C(Latitude_areas)[T.south equator] -0.1156 0.015 -7.945 0.000 -0.144 -0.087
C(Latitude_areas)[T.south pole] -0.1498 0.021 -7.167 0.000 -0.191 -0.109
==============================================================================
Omnibus: 870.288 Durbin-Watson: 1.916
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1206.941
Skew: 0.976 Prob(JB): 8.24e-263
Kurtosis: 3.347 Cond. No. 5.58
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 4-5
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.043
Model: OLS Adj. R-squared: 0.043
Method: Least Squares F-statistic: 170.5
Date: Mon, 27 Jun 2016 Prob (F-statistic): 3.93e-108
Time: 09:38:21 Log-Likelihood: -8286.0
No. Observations: 11295 AIC: 1.658e+04
Df Residuals: 11291 BIC: 1.661e+04
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.3016 0.008 36.482 0.000 0.285 0.318
C(Latitude_areas)[T.north pole] 0.3731 0.021 17.603 0.000 0.332 0.415
C(Latitude_areas)[T.south equator] -0.0699 0.011 -6.508 0.000 -0.091 -0.049
C(Latitude_areas)[T.south pole] -0.1111 0.015 -7.192 0.000 -0.141 -0.081
==============================================================================
Omnibus: 3199.588 Durbin-Watson: 1.858
Prob(Omnibus): 0.000 Jarque-Bera (JB): 7339.115
Skew: 1.624 Prob(JB): 0.00
Kurtosis: 5.246 Cond. No. 5.44
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 3-4
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.031
Model: OLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 222.7
Date: Mon, 27 Jun 2016 Prob (F-statistic): 3.28e-142
Time: 09:38:21 Log-Likelihood: -12117.
No. Observations: 20962 AIC: 2.424e+04
Df Residuals: 20958 BIC: 2.427e+04
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.1803 0.005 34.648 0.000 0.170 0.190
C(Latitude_areas)[T.north pole] 0.2508 0.012 21.488 0.000 0.228 0.274
C(Latitude_areas)[T.south equator] -0.0420 0.007 -6.157 0.000 -0.055 -0.029
C(Latitude_areas)[T.south pole] -0.0075 0.010 -0.774 0.439 -0.027 0.011
==============================================================================
Omnibus: 9540.222 Durbin-Watson: 1.827
Prob(Omnibus): 0.000 Jarque-Bera (JB): 41239.822
Skew: 2.295 Prob(JB): 0.00
Kurtosis: 8.113 Cond. No. 5.01
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 2-3
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.055
Model: OLS Adj. R-squared: 0.055
Method: Least Squares F-statistic: 999.2
Date: Mon, 27 Jun 2016 Prob (F-statistic): 0.00
Time: 09:38:21 Log-Likelihood: 40147.
No. Observations: 51769 AIC: -8.029e+04
Df Residuals: 51765 BIC: -8.025e+04
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.0048 0.001 5.611 0.000 0.003 0.006
C(Latitude_areas)[T.north pole] 0.0955 0.002 49.640 0.000 0.092 0.099
C(Latitude_areas)[T.south equator] -0.0045 0.001 -4.070 0.000 -0.007 -0.002
C(Latitude_areas)[T.south pole] -0.0036 0.002 -2.184 0.029 -0.007 -0.000
==============================================================================
Omnibus: 88171.936 Durbin-Watson: 1.936
Prob(Omnibus): 0.000 Jarque-Bera (JB): 64114241.623
Skew: 12.032 Prob(JB): 0.00
Kurtosis: 173.716 Cond. No. 5.04
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 1-2
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.022
Model: OLS Adj. R-squared: 0.022
Method: Least Squares F-statistic: 1888.
Date: Mon, 27 Jun 2016 Prob (F-statistic): 0.00
Time: 09:38:22 Log-Likelihood: 4.0129e+05
No. Observations: 252719 AIC: -8.026e+05
Df Residuals: 252715 BIC: -8.025e+05
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.0008 0.000 4.921 0.000 0.000 0.001
C(Latitude_areas)[T.north pole] 0.0327 0.000 71.096 0.000 0.032 0.034
C(Latitude_areas)[T.south equator] -0.0008 0.000 -3.516 0.000 -0.001 -0.000
C(Latitude_areas)[T.south pole] 0.0004 0.000 1.168 0.243 -0.000 0.001
==============================================================================
Omnibus: 596212.894 Durbin-Watson: 1.872
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4728605013.725
Skew: 24.132 Prob(JB): 0.00
Kurtosis: 671.381 Cond. No. 5.69
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Two-way ANOVA: number of layers vs latitude for crater category size 100-1165
OLS Regression Results
==============================================================================
Dep. Variable: NUMBER_LAYERS R-squared: 0.009
Model: OLS Adj. R-squared: -0.001
Method: Least Squares F-statistic: 0.8576
Date: Mon, 27 Jun 2016 Prob (F-statistic): 0.463
Time: 09:38:22 Log-Likelihood: 439.43
No. Observations: 304 AIC: -870.9
Df Residuals: 300 BIC: -856.0
Df Model: 3
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
Intercept 0.0118 0.006 1.890 0.060 -0.000 0.024
C(Latitude_areas)[T.north pole] -0.0118 0.026 -0.445 0.656 -0.064 0.040
C(Latitude_areas)[T.south equator] -0.0118 0.008 -1.506 0.133 -0.027 0.004
C(Latitude_areas)[T.south pole] -0.0118 0.009 -1.249 0.212 -0.030 0.007
==============================================================================
Omnibus: 698.534 Durbin-Watson: 2.025
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1108895.362
Skew: 17.126 Prob(JB): 0.00
Kurtosis: 296.890 Cond. No. 9.29
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The two-way ANOVA revealed that the relationship between latitude and the number of layers is preserved when the crater is small, while it is lost when the crater is very large.
In conclusion, this two-way ANOVA highlighted that crater size plays an important role in the association between latitude and number of layers, and that this variable has to be taken into account to avoid drawing misleading conclusions.
Assignment #3
For the purposes of this assignment, Generating a Pearson Correlation Coefficient, I will slightly modify the research question used in the previous course.
Hence, the question I will be looking at in this assignment is: is there an association or relationship between Income Per Person and Life Expectancy of the people of Ghana?
Hypothesis Testing
The Null and Alternate Hypotheses:
From the above research question, the Null Hypothesis (Ho) is that there is no association/relationship between Income Per Person and Life Expectancy of the people of Ghana.
The Alternate Hypothesis (Ha) states that there is an association/relationship between Income Per Person and Life Expectancy of the people of Ghana.
# -*- coding: utf-8 -*-
"""
Created on
@author: navinkumar
"""
import pandas
import numpy
import seaborn
import scipy
import matplotlib.pyplot as plt
data = pandas.read_csv('gapminder_ghana_updated.csv',low_memory=False)
#setting variables you will be working with to numeric
data['incomeperperson'] = data['incomeperperson'].convert_objects(convert_numeric=True)
data['lifeexpectancy'] = data['lifeexpectancy'].convert_objects(convert_numeric=True)
# replacing missing values with NaN
data['incomeperperson'] = data['incomeperperson'].replace('', numpy.nan)
data['lifeexpectancy'] = data['lifeexpectancy'].replace('', numpy.nan)
scat1 = seaborn.regplot(x="incomeperperson",y="lifeexpectancy", fit_reg=True, data=data)
plt.xlabel('incomeperperson')
plt.ylabel('lifeexpectancy')
plt.title('Scatterplot for the Association Between incomeperperson and lifeexpectancy of Ghana')
print ('')
print ('')
print ('')
print ('association between incomeperperson and lifeexpectancy of Ghana')
print (scipy.stats.pearsonr(data['incomeperperson'], data['lifeexpectancy']))
print ('')
print ('')
print ('')
<<<<<<<<<<<<<CODE OUTPUT BEGIN>>>>>>>>>>>>>>>>>>>
association between incomeperperson and lifeexpectancy of Ghana
(0.84735157770557723, 9.6218092417241115e-61)
DRAWING CONCLUSION (SUMMARY):
From the output of the code, the p-value = 9.62e-61, which is far below the conventional significance threshold of 0.05 (5%), and the Pearson correlation coefficient is r = 0.8474.
The p-value indicates that the test is significant. This means that we have enough evidence against the Null Hypothesis (Ho) and can therefore reject the Null Hypothesis (Ho).
In other words, there is a relationship or association between Income Per Person and Life Expectancy of the people of Ghana, and it is a strong positive relationship because the value 0.8474 is close to positive one (+1).
This is further demonstrated by the scatterplot: an increase in incomeperperson is associated with an increase in the lifeexpectancy of the people of Ghana, as shown by the upward-sloping regression line.
Hence there is a strong positive relationship between Income Per Person and Life Expectancy of the people of Ghana.
Also, given the size of the p-value, the positive relationship between Income Per Person and Life Expectancy of the people of Ghana is very unlikely to have occurred by chance.
Furthermore, squaring the Pearson correlation coefficient (r) gives the proportion of variability in the Life Expectancy of the people of Ghana that can be predicted from Income Per Person.
Hence, mathematically,
r = 0.8474
r² = (0.8474)² ≈ 0.718
The value 0.718 means that Income Per Person accounts for about 71.8% of the variability we see in the lifeexpectancy of the people of Ghana.
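As a quick check of this arithmetic (a minimal sketch; the r value is taken from the pearsonr output above):
r = 0.84735           # Pearson r reported by scipy.stats.pearsonr above
r_squared = r ** 2    # proportion of variance in life expectancy explained
print(round(r_squared, 3))   # ~0.718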
Data Analysis Tools Wesleyan University
Week 2 Assignment: Running a Chi-Square Test
Research Question: Are there any differences in how much young adults smoke monthly between the West and Midwest U.S. regions?
Dataset: U.S. National Epidemiological Survey on Alcohol and Related Conditions (NESARC)
Source Code:
# -*- coding: utf-8 -*-
"""
@author: Navinkumar
@Based on the Week 2 example available in Coursera
"""
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt
data = pandas.read_csv('nesarc_pds.csv', low_memory=True, na_values=' ',
                       usecols=["S3AQ3B1", "CHECK321", "AGE", "REGION"],
                       dtype={"CHECK321": float, "AGE": float})
# new code setting variables you will be working with to numeric
data['CHECK321'] = pandas.to_numeric(data['CHECK321'], errors='coerce')
data['AGE'] = pandas.to_numeric(data['AGE'], errors='coerce')
#subset data to young adults age 18 to 25 who have smoked in the past 12 months
sub1=data[(data['AGE']>=18) & (data['AGE']<=25) & (data['CHECK321']==1)]
#make a copy of my new subsetted data
sub2 = sub1.copy()
# recode missing values to python missing (NaN)
sub2['S3AQ3B1']=sub2['S3AQ3B1'].replace(9, numpy.nan)
#recoding values for S3AQ3B1 into a new variable, USFREQMO
recode1 = {1: 30, 2: 22, 3: 14, 4: 6, 5: 2.5, 6: 1}
sub2['USFREQMO']= sub2['S3AQ3B1'].map(recode1)
recodeReg = {4: 'West', 2:'Midwest'}
sub2['SUBREGION']= sub2['REGION'].map(recodeReg)
# set variable types
sub2["USFREQMO"] = sub2["USFREQMO"].astype('category')
###General comparison
# contingency table of observed counts
ct1=pandas.crosstab(sub2['SUBREGION'], sub2['USFREQMO'])
print (ct1)
# column percentages
colsum=ct1.sum(axis=0)
colpct=ct1/colsum
print(colpct)
# chi-square
print ('chi-square value, p value, expected counts')
cs1= scipy.stats.chi2_contingency(ct1)
print (cs1)
###Beginning the pairwise comparisons
print('\n\nBeginning Pairwise comparisons\n')
recode2 = {1: 1, 2.5: 2.5}
sub2['COMP1v2']= sub2['USFREQMO'].map(recode2)
# contingency table of observed counts
ct2=pandas.crosstab(sub2['SUBREGION'], sub2['COMP1v2'])
print (ct2)
# column percentages
colsum=ct2.sum(axis=0)
colpct=ct2/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs2= scipy.stats.chi2_contingency(ct2)
print (cs2)
recode3 = {1: 1, 6: 6}
sub2['COMP1v6']= sub2['USFREQMO'].map(recode3)
# contingency table of observed counts
ct3=pandas.crosstab(sub2['SUBREGION'], sub2['COMP1v6'])
print (ct3)
# column percentages
colsum=ct3.sum(axis=0)
colpct=ct3/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs3= scipy.stats.chi2_contingency(ct3)
print (cs3)
recode4 = {1: 1, 14: 14}
sub2['COMP1v14']= sub2['USFREQMO'].map(recode4)
# contingency table of observed counts
ct4=pandas.crosstab(sub2['SUBREGION'], sub2['COMP1v14'])
print (ct4)
# column percentages
colsum=ct4.sum(axis=0)
colpct=ct4/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs4= scipy.stats.chi2_contingency(ct4)
print (cs4)
recode5 = {1: 1, 22: 22}
sub2['COMP1v22']= sub2['USFREQMO'].map(recode5)
# contingency table of observed counts
ct5=pandas.crosstab(sub2['SUBREGION'], sub2['COMP1v22'])
print (ct5)
# column percentages
colsum=ct5.sum(axis=0)
colpct=ct5/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs5= scipy.stats.chi2_contingency(ct5)
print (cs5)
recode6 = {1: 1, 30: 30}
sub2['COMP1v30']= sub2['USFREQMO'].map(recode6)
# contingency table of observed counts
ct6=pandas.crosstab(sub2['SUBREGION'], sub2['COMP1v30'])
print (ct6)
# column percentages
colsum=ct6.sum(axis=0)
colpct=ct6/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs6= scipy.stats.chi2_contingency(ct6)
print (cs6)
recode7 = {2.5: 2.5, 6: 6}
sub2['COMP2v6']= sub2['USFREQMO'].map(recode7)
# contingency table of observed counts
ct7=pandas.crosstab(sub2['SUBREGION'], sub2['COMP2v6'])
print (ct7)
# column percentages
colsum=ct7.sum(axis=0)
colpct=ct7/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs7= scipy.stats.chi2_contingency(ct7)
print (cs7)
recode8 = {2.5: 2.5, 14: 14}
sub2['COMP2v14']= sub2['USFREQMO'].map(recode8)
# contingency table of observed counts
ct8=pandas.crosstab(sub2['SUBREGION'], sub2['COMP2v14'])
print (ct8)
# column percentages
colsum=ct8.sum(axis=0)
colpct=ct8/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs8= scipy.stats.chi2_contingency(ct8)
print (cs8)
recode9 = {2.5: 2.5, 22: 22}
sub2['COMP2v22']= sub2['USFREQMO'].map(recode9)
# contingency table of observed counts
ct9=pandas.crosstab(sub2['SUBREGION'], sub2['COMP2v22'])
print (ct9)
# column percentages
colsum=ct9.sum(axis=0)
colpct=ct9/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs9= scipy.stats.chi2_contingency(ct9)
print (cs9)
recode10 = {2.5: 2.5, 30: 30}
sub2['COMP2v30']= sub2['USFREQMO'].map(recode10)
# contingency table of observed counts
ct10=pandas.crosstab(sub2['SUBREGION'], sub2['COMP2v30'])
print (ct10)
# column percentages
colsum=ct10.sum(axis=0)
colpct=ct10/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs10= scipy.stats.chi2_contingency(ct10)
print (cs10)
recode11 = {6: 6, 14: 14}
sub2['COMP6v14']= sub2['USFREQMO'].map(recode11)
# contingency table of observed counts
ct11=pandas.crosstab(sub2['SUBREGION'], sub2['COMP6v14'])
print (ct11)
# column percentages
colsum=ct11.sum(axis=0)
colpct=ct11/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs11= scipy.stats.chi2_contingency(ct11)
print (cs11)
recode12 = {6: 6, 22: 22}
sub2['COMP6v22']= sub2['USFREQMO'].map(recode12)
# contingency table of observed counts
ct12=pandas.crosstab(sub2['SUBREGION'], sub2['COMP6v22'])
print (ct12)
# column percentages
colsum=ct12.sum(axis=0)
colpct=ct12/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs12= scipy.stats.chi2_contingency(ct12)
print (cs12)
recode13 = {6: 6, 30: 30}
sub2['COMP6v30']= sub2['USFREQMO'].map(recode13)
# contingency table of observed counts
ct13=pandas.crosstab(sub2['SUBREGION'], sub2['COMP6v30'])
print (ct13)
# column percentages
colsum=ct13.sum(axis=0)
colpct=ct13/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs13= scipy.stats.chi2_contingency(ct13)
print (cs13)
recode14 = {14: 14, 22: 22}
sub2['COMP14v22']= sub2['USFREQMO'].map(recode14)
# contingency table of observed counts
ct14=pandas.crosstab(sub2['SUBREGION'], sub2['COMP14v22'])
print (ct14)
# column percentages
colsum=ct14.sum(axis=0)
colpct=ct14/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs14= scipy.stats.chi2_contingency(ct14)
print (cs14)
recode15 = {14: 14, 30: 30}
sub2['COMP14v30']= sub2['USFREQMO'].map(recode15)
# contingency table of observed counts
ct15=pandas.crosstab(sub2['SUBREGION'], sub2['COMP14v30'])
print (ct15)
# column percentages
colsum=ct15.sum(axis=0)
colpct=ct15/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs15= scipy.stats.chi2_contingency(ct15)
print (cs15)
recode16 = {22: 22, 30: 30}
sub2['COMP22v30']= sub2['USFREQMO'].map(recode16)
# contingency table of observed counts
ct16=pandas.crosstab(sub2['SUBREGION'], sub2['COMP22v30'])
print (ct16)
# column percentages
colsum=ct16.sum(axis=0)
colpct=ct16/colsum
print(colpct)
#chi square
print ('chi-square value, p value, expected counts')
cs16= scipy.stats.chi2_contingency(ct16)
print (cs16)
print('\n\nPairwise comparisons = 15')
print('Bonferroni Adjustment = 0.05/15')
print('\nBonferroni Adjusted p value for significance < 0.008')
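As a design note, the fifteen pairwise blocks above could also be generated in a loop over category pairs; the following is a minimal sketch of that idea (not the code used for the results below, and the names usfreq and mask are purely illustrative):
# Sketch: generate all pairwise chi-square comparisons in a loop
import itertools
usfreq = sub2['USFREQMO'].astype(float)   # plain numeric copy of the smoking-frequency categories
for a, b in itertools.combinations([1, 2.5, 6, 14, 22, 30], 2):
    mask = usfreq.isin([a, b])
    ct = pandas.crosstab(sub2['SUBREGION'][mask], usfreq[mask])
    chi2, p, dof, expected = scipy.stats.chi2_contingency(ct)
    print('%g vs %g days: chi-square = %.3f, p = %.5f' % (a, b, chi2, p))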
Results:
USFREQMO 1.0 2.5 6.0 14.0 22.0 30.0
SUBREGION
Midwest 13 16 19 23 18 358
West 26 17 29 17 20 268
USFREQMO 1.0 2.5 6.0 14.0 22.0 30.0
SUBREGION
Midwest 0.333333 0.484848 0.395833 0.575 0.473684 0.571885
West 0.666667 0.515152 0.604167 0.425 0.526316 0.428115
chi-square value, p value, expected counts
(14.54993125305942, 0.012468884171180491, 5, array([[ 21.1565534 , 17.90169903, 26.03883495, 21.69902913,
20.61407767, 339.58980583],
[ 17.8434466 , 15.09830097, 21.96116505, 18.30097087,
17.38592233, 286.41019417]]))
Beginning Pairwise comparisons
COMP1v2 1.0 2.5
SUBREGION
Midwest 13 16
West 26 17
COMP1v2 1.0 2.5
SUBREGION
Midwest 0.333333 0.484848
West 0.666667 0.515152
chi-square value, p value, expected counts
(1.1341793731529095, 0.28688562360604997, 1, array([[ 15.70833333, 13.29166667],
[ 23.29166667, 19.70833333]]))
COMP1v6 1 6
SUBREGION
Midwest 13 19
West 26 29
COMP1v6 1 6
SUBREGION
Midwest 0.333333 0.395833
West 0.666667 0.604167
chi-square value, p value, expected counts
(0.1426511964597903, 0.70565946102422616, 1, array([[ 14.34482759, 17.65517241],
[ 24.65517241, 30.34482759]]))
COMP1v14 1 14
SUBREGION
Midwest 13 23
West 26 17
COMP1v14 1 14
SUBREGION
Midwest 0.333333 0.575
West 0.666667 0.425
chi-square value, p value, expected counts
(3.7263109347048271, 0.053561567428486494, 1, array([[ 17.7721519, 18.2278481],
[ 21.2278481, 21.7721519]]))
COMP1v22 1 22
SUBREGION
Midwest 13 18
West 26 20
COMP1v22 1 22
SUBREGION
Midwest 0.333333 0.473684
West 0.666667 0.526316
chi-square value, p value, expected counts
(1.0467968355185073, 0.30624595946747202, 1, array([[ 15.7012987, 15.2987013],
[ 23.2987013, 22.7012987]]))
COMP1v30 1 30
SUBREGION
Midwest 13 358
West 26 268
COMP1v30 1 30
SUBREGION
Midwest 0.333333 0.571885
West 0.666667 0.428115
chi-square value, p value, expected counts
(7.5308403506494086, 0.0060651609867727295, 1, array([[ 21.75789474, 349.24210526],
[ 17.24210526, 276.75789474]]))
COMP2v6 2.5 6.0
SUBREGION
Midwest 16 19
West 17 29
COMP2v6 2.5 6.0
SUBREGION
Midwest 0.484848 0.395833
West 0.515152 0.604167
chi-square value, p value, expected counts
(0.3208012775268208, 0.5711265039906589, 1, array([[ 14.25925926, 20.74074074],
[ 18.74074074, 27.25925926]]))
COMP2v14 2.5 14.0
SUBREGION
Midwest 16 23
West 17 17
COMP2v14 2.5 14.0
SUBREGION
Midwest 0.484848 0.575
West 0.515152 0.425
chi-square value, p value, expected counts
(0.28386595022624428, 0.59417845915735179, 1, array([[ 17.63013699, 21.36986301],
[ 15.36986301, 18.63013699]]))
COMP2v22 2.5 22.0
SUBREGION
Midwest 16 18
West 17 20
COMP2v22 2.5 22.0
SUBREGION
Midwest 0.484848 0.473684
West 0.515152 0.526316
chi-square value, p value, expected counts
(0.02080449081223084, 0.88531283564528629, 1, array([[ 15.8028169, 18.1971831],
[ 17.1971831, 19.8028169]]))
COMP2v30 2.5 30.0
SUBREGION
Midwest 16 358
West 17 268
COMP2v30 2.5 30.0
SUBREGION
Midwest 0.484848 0.571885
West 0.515152 0.428115
chi-square value, p value, expected counts
(0.64539943520707399, 0.42176231975685008, 1, array([[ 18.72837633, 355.27162367],
[ 14.27162367, 270.72837633]]))
COMP6v14 6 14
SUBREGION
Midwest 19 23
West 29 17
COMP6v14 6 14
SUBREGION
Midwest 0.395833 0.575
West 0.604167 0.425
chi-square value, p value, expected counts
(2.1350931677018647, 0.14396171516789358, 1, array([[ 22.90909091, 19.09090909],
[ 25.09090909, 20.90909091]]))
COMP6v22 6 22
SUBREGION
Midwest 19 18
West 29 20
COMP6v22 6 22
SUBREGION
Midwest 0.395833 0.473684
West 0.604167 0.526316
chi-square value, p value, expected counts
(0.25488612941620509, 0.61365542442547072, 1, array([[ 20.65116279, 16.34883721],
[ 27.34883721, 21.65116279]]))
COMP6v30 6 30
SUBREGION
Midwest 19 358
West 29 268
COMP6v30 6 30
SUBREGION
Midwest 0.395833 0.571885
West 0.604167 0.428115
chi-square value, p value, expected counts
(4.9145434876472081, 0.026631500102652042, 1, array([[ 26.84866469, 350.15133531],
[ 21.15133531, 275.84866469]]))
COMP14v22 14 22
SUBREGION
Midwest 23 18
West 17 20
COMP14v22 14 22
SUBREGION
Midwest 0.575 0.473684
West 0.425 0.526316
chi-square value, p value, expected counts
(0.447364084238282, 0.50358937393222392, 1, array([[ 21.02564103, 19.97435897],
[ 18.97435897, 18.02564103]]))
COMP14v30 14 22
SUBREGION
Midwest 23 18
West 17 20
COMP14v30 14 22
SUBREGION
Midwest 0.575 0.473684
West 0.425 0.526316
chi-square value, p value, expected counts
(0.447364084238282, 0.50358937393222392, 1, array([[ 21.02564103, 19.97435897],
[ 18.97435897, 18.02564103]]))
COMP22v30 22 30
SUBREGION
Midwest 18 358
West 20 268
COMP22v30 22 30
SUBREGION
Midwest 0.473684 0.571885
West 0.526316 0.428115
chi-square value, p value, expected counts
(1.0352023548436733, 0.30893990838753638, 1, array([[ 21.51807229, 354.48192771],
[ 16.48192771, 271.51807229]]))
Pairwise comparisons = 15
Bonferroni Adjustment = 0.05/15
Bonferroni Adjusted p value for significance < 0.008
Model Interpretation for post hoc Chi-Square Test results:
A chi-square test of independence revealed that among young adult smokers (my sample), the number of days smoked per month (collapsed into 6 ordered categories) and the U.S. region of residence, West versus Midwest (a binary categorical variable), were significantly associated, X² = 14.55, 5 df, p = .0125.
Post hoc comparisons of the regional split (West vs. Midwest) across pairs of smoking-frequency categories revealed that the only statistically significant difference is between the groups smoking 1 day and 30 days per month: young adults in the West make up a larger share of those smoking 1 day per month, while those in the Midwest make up a larger share of those smoking 30 days per month. All other comparisons were not statistically significant.
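A small sketch of how the Bonferroni-adjusted threshold used in this post separates the pairwise results (the p-values are copied from the chi-square outputs above; only the smallest few are listed):
# Sketch: compare pairwise p-values against the adjusted threshold used above
bonferroni_threshold = 0.008          # threshold stated in the write-up above
pairwise_p = {
    '1 vs 30 days': 0.00607,          # the only comparison below the threshold
    '6 vs 30 days': 0.02663,
    '1 vs 14 days': 0.05356,
    # ... the remaining 12 pairs are omitted; all have larger p-values
}
for pair, p in sorted(pairwise_p.items(), key=lambda item: item[1]):
    verdict = 'significant' if p < bonferroni_threshold else 'not significant'
    print('%s: p = %.5f -> %s' % (pair, p, verdict))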
Data Analysis Tool : Assignment
Breast Cancer Causes Internet Usage (BCCIU): the goal is to analyse the relationships between new breast cancer cases per 100,000 women in 2002 and internet use rates in 2010 or female employment rates in 2007, respectively.
First up comes some of the code I created before, including a summary figure for your information.
# Activate inline plotting, should be first statement
%matplotlib inline
# load packages
import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf # for ANOVA
import statsmodels.stats.multicomp as multi # for post hoc test
import warnings # ignore warnings (e.g. from future, deprecation, etc.)
warnings.filterwarnings('ignore') # for layout reasons, after I read and acknowledged them all!
# read in data
data = pandas.read_csv("../gapminder.csv", low_memory=False)
# subset the data and make a copy to avoid error messages later on
sub = data[["country", "breastcancerper100th", "femaleemployrate", "internetuserate"]]
sub_data = sub.copy()
# change data types to numeric
sub_data["breastcancerper100th"] = sub_data["breastcancerper100th"].convert_objects(convert_numeric=True)
sub_data["femaleemployrate"] = sub_data["femaleemployrate"].convert_objects(convert_numeric=True)
sub_data["internetuserate"] = sub_data["internetuserate"].convert_objects(convert_numeric=True)
# remove rows with missing values (copy again)
sub2 = sub_data.dropna()
sub_data2 = sub2.copy()
# plot comprehensive pair plot of subsetted data
#semicolon hides text output of matplotlib
seaborn.pairplot(sub_data2);
In the plot above, you can see the distributions of my variables of interest as histograms on the diagonal, and scatterplots of their relationships with each other in the other fields. Most striking is the linear relationship between breast cancer and internet usage - hence the BCCIU slogan. During the evaluation of my final visualisation post, I was asked why I’m not saying that internet usage causes breast cancer, but instead state something stupid (my own words) like breast cancer causes internet usage. The reason is simple - the breast cancer data is from 2002, the internet usage data, on the other hand, is from 2010. Since I haven’t seen re-annual plants in this world yet, I don’t believe that such a backwards causation can exist, and since I don’t have data for both variables from the same year, or the opposite of what I have, I’m sticking to my topic.
In the first week of the Data Analysis course, we’re using ANOVA (analysis of variance) and Tukey’s HSD (honest significant difference) test to check for significant differences in mean values of different groups. This means that we’re comparing quantitative and categorical data - which means I need a categorical explanatory variable. Therefore, I’m splitting the breast cancer data into its quartiles (four equal sized groups).
# quartile split (use qcut function - ask for 4 groups)
print('breast cancer cases per 100,000 females - quartiles')
sub_data2['breastquart'] = pandas.qcut(sub_data2.breastcancerper100th, 4, labels=["25th", "50th", "75th", "100th"])
sub_data2['breastquart'] = sub_data2.breastquart.astype(numpy.object) # convert to a data type smf.ols() can understand
print(sub_data2['breastquart'].value_counts(sort=False))
# "melt" the data into better (long) format for factorplot()
sub_data2_m = pandas.melt(sub_data2, id_vars=["breastquart"], value_vars=["femaleemployrate", "internetuserate"])
# plot (setting order manually to avoid weird automatic order)
seaborn.factorplot(x='breastquart', y='value', col="variable", data=sub_data2_m,
                   kind="box", ci=None, x_order=["25th", "50th", "75th", "100th"]);
breast cancer cases per 100,000 females - quartiles
50th 40
75th 40
25th 41
100th 41
Name: breastquart, dtype: int64
Great, four (almost) equal sized groups! And the boxplots show the linear relationship between breast cancer and internet usage, and only a hinted half-circle relation between breast cancer and female employment - just as expected.
What we want to do now is check if the means of the values differ between the different quartiles, either for the female employment rate or the internet usage rate. Of course, testing only differences in means doesn’t mean much if the variance within a group is very large. That is where ANOVA - the analysis of variance - comes in. This method can tell us if the variance within a group is small enough for the variance between two groups to be significant (i.e. “Can I trust this difference between my groups?”). Before we test that, though, let’s have a look at the actual means and standard deviations of the data!
# calculate and print means for only the two interesting variables in the breast cancer groups
print ("means for female employment and internet usage by breast cancer quartiles")
print(sub_data2.groupby("breastquart")["femaleemployrate", "internetuserate"].mean())
# calculate and print standard deviations for only the two interesting variables in the breast cancer groups
print ("\n\nstandard deviations for female employment and internet usage by breast cancer quartiles")
print(sub_data2.groupby("breastquart")["femaleemployrate", "internetuserate"].std())
means for female employment and internet usage by breast cancer quartiles
femaleemployrate internetuserate
breastquart
100th 47.531707 65.645802
25th 56.148780 13.583038
50th 44.407500 17.851427
75th 42.630000 38.971075
standard deviations for female employment and internet usage by breast cancer quartiles
femaleemployrate internetuserate
breastquart
100th 10.192631 18.686857
25th 16.156842 17.237897
50th 15.217860 16.590912
75th 13.342091 21.744734
Not surprisingly, we can see quite an increase in means for internet usage over the breast cancer quartiles (please note the somewhat irritating order of the quartiles!), while the female employment rate shows less pronounced differences. Can these still be significant?
To test for significance (ANOVA), we’re using a function called ols() (ordinary least squares) from the statsmodels.formula.api package. OLS is a powerful linear regression method about which we will apparently learn in a later course.
# using ols() function for calculating the F-statistic and associated p-value
breast_inet_m = smf.ols(formula='internetuserate ~ C(breastquart)', data=sub_data2)
breast_inet_r = breast_inet_m.fit()
print ("breast cancer versus internet usage\n", breast_inet_r.summary())
breast_empl_m = smf.ols(formula='femaleemployrate ~ C(breastquart)', data=sub_data2)
breast_empl_r = breast_empl_m.fit()
print ("\n\nbreast cancer versus female employment\n", breast_empl_r.summary())
breast cancer versus internet usage
OLS Regression Results
==============================================================================
Dep. Variable: internetuserate R-squared: 0.558
Model: OLS Adj. R-squared: 0.550
Method: Least Squares F-statistic: 66.58
Date: Thu, 05 Nov 2015 Prob (F-statistic): 6.92e-28
Time: 08:52:15 Log-Likelihood: -701.94
No. Observations: 162 AIC: 1412.
Df Residuals: 158 BIC: 1424.
Df Model: 3
Covariance Type: nonrobust
==========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------
Intercept 65.6458 2.915 22.523 0.000 59.889 71.402
C(breastquart)[T.25th] -52.0628 4.122 -12.631 0.000 -60.204 -43.922
C(breastquart)[T.50th] -47.7944 4.148 -11.524 0.000 -55.986 -39.603
C(breastquart)[T.75th] -26.6747 4.148 -6.431 0.000 -34.866 -18.483
==============================================================================
Omnibus: 20.285 Durbin-Watson: 1.911
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.931
Skew: 0.800 Prob(JB): 3.86e-06
Kurtosis: 4.065 Cond. No. 4.77
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
breast cancer versus female employment
OLS Regression Results
==============================================================================
Dep. Variable: femaleemployrate R-squared: 0.126
Model: OLS Adj. R-squared: 0.109
Method: Least Squares F-statistic: 7.562
Date: Thu, 05 Nov 2015 Prob (F-statistic): 9.28e-05
Time: 08:52:15 Log-Likelihood: -654.33
No. Observations: 162 AIC: 1317.
Df Residuals: 158 BIC: 1329.
Df Model: 3
Covariance Type: nonrobust
==========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------------------
Intercept 47.5317 2.172 21.880 0.000 43.241 51.822
C(breastquart)[T.25th] 8.6171 3.072 2.805 0.006 2.549 14.685
C(breastquart)[T.50th] -3.1242 3.091 -1.011 0.314 -9.230 2.982
C(breastquart)[T.75th] -4.9017 3.091 -1.586 0.115 -11.007 1.204
==============================================================================
Omnibus: 0.513 Durbin-Watson: 1.777
Prob(Omnibus): 0.774 Jarque-Bera (JB): 0.344
Skew: -0.110 Prob(JB): 0.842
Kurtosis: 3.052 Cond. No. 4.77
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The ols() function returns a lot of results (and the warning also seems to be a standard), but for now we’ll only look at the F-statistic and the Prob (F-statistic). For breast cancer versus internet usage, the F-statistic = 66.58 – this is the result of dividing the variance between groups by the variance within groups. The higher the number, therefore, the higher is the variance between groups compared to the variance within groups, so this value is pretty good. Accordingly, the probability that we could see this value simply by chance is very low (p < 0.0001).
The comparison of breast cancer versus female employment, on the other hand, shows a lower F-statistic of 7.562. Nevertheless, the p-value is still below the common threshold of 0.05 that is mostly used in science (p = 0.0000928). This means that, according to the ANOVA/OLS, the difference between groups is also significant in this comparison.
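To make the between/within-variance idea concrete, here is a tiny hypothetical illustration (the three groups are made up, not from the gapminder data; smf.ols() computes the same quantity properly for the real analysis):
# Hypothetical F-statistic: between-group variance divided by within-group variance
groups = [numpy.array([10.0, 12.0, 11.0]),
          numpy.array([20.0, 19.0, 21.0]),
          numpy.array([15.0, 14.0, 16.0])]
grand_mean = numpy.concatenate(groups).mean()
ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups)
ss_within = sum(((g - g.mean()) ** 2).sum() for g in groups)
df_between = len(groups) - 1
df_within = sum(len(g) for g in groups) - len(groups)
print((ss_between / df_between) / (ss_within / df_within))   # large F -> clear group differences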
The problem we have now, with the ANOVA results, is that they don’t tell us which groups differ significantly from each other. Seeing ANOVA as a hypothesis test, we’d say that the null hypothesis is “all means are equal”, while the alternative hypothesis is simply “not all means are equal”. While we know now that there are significant differences in the data, we still don’t know which group’s means are (significantly) not equal. To figure that out, we need a post hoc test like Tukey’s HSD test.
# do multiple comparisons and Tukey HSD
breast_inet_mc = multi.MultiComparison(sub_data2['internetuserate'], sub_data2['breastquart'])
breast_inet_rc = breast_inet_mc.tukeyhsd()
print("breast cancer versus internet usage\n", breast_inet_rc.summary())
breast_empl_mc = multi.MultiComparison(sub_data2['femaleemployrate'], sub_data2['breastquart'])
breast_empl_rc = breast_empl_mc.tukeyhsd()
print("\n\nbreast cancer versus female employment\n", breast_empl_rc.summary())
breast cancer versus internet usage
Multiple Comparison of Means - Tukey HSD,FWER=0.05
===============================================
group1 group2 meandiff lower upper reject
-----------------------------------------------
100th 25th -52.0628 -62.7661 -41.3594 True
100th 50th -47.7944 -58.5644 -37.0243 True
100th 75th -26.6747 -37.4448 -15.9047 True
25th 50th 4.2684 -6.5017 15.0384 False
25th 75th 25.388 14.618 36.1581 True
50th 75th 21.1196 10.2833 31.956 True
-----------------------------------------------
breast cancer versus female employment
Multiple Comparison of Means - Tukey HSD,FWER=0.05
==============================================
group1 group2 meandiff lower upper reject
----------------------------------------------
100th 25th 8.6171 0.6393 16.5948 True
100th 50th -3.1242 -11.1517 4.9033 False
100th 75th -4.9017 -12.9292 3.1258 False
25th 50th -11.7413 -19.7688 -3.7138 True
25th 75th -13.5188 -21.5463 -5.4913 True
50th 75th -1.7775 -9.8544 6.2994 False
----------------------------------------------
This has much less output than the OLS. In the headline, we are told which kind of test was used (Tukey HSD) and how the multiple comparison problem was corrected.
In case you don’t know what I’m talking about: when we do a single statistical test that returns a p-value, this p-value tells us how likely it is that we were wrong in rejecting our null hypothesis. Usually, you hope for p < 0.05, or less than 5% - meaning a 5% chance of making a type I error (rejecting the null hypothesis even though it is true). I’ll skip the math here and try with common sense instead: the more tests you do on subsets of your data, the higher are your chances to find some random effect - for only four tests, the probability of making at least one mistake is already 0.185 and not 0.05 or below any more. This is called the family wise error rate, or FWER - the probability of making at least one mistake (i.e. type I error).
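The 0.185 figure can be verified in a couple of lines (assuming independent tests, which is a simplification):
# Family-wise error rate for k independent tests at alpha = 0.05
alpha = 0.05
for k in (1, 4, 15):
    fwer = 1 - (1 - alpha) ** k
    print(k, round(fwer, 3))   # k = 4 gives ~0.185, as mentioned above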
The result summary printed above shows that the FWER was taken into account automatically, and then lists all the group-wise comparisons, their differences, and whether or not we can reject the null hypothesis (that both groups have equal means).
For the breast cancer versus internet usage comparison - my main topic -, all quartiles have significantly different means, except the 25th and 50th. From a statistics point of view, it’s quite safe to say that there is a significant difference in the internet use rates for different groups of new breast cancer cases, with F(3, 158) = 66.58 (the numbers in brackets are degrees of freedom, df model and df residuals, from the ANOVA summary above) and p < 0.0001. Additionally, the post hoc test revealed that countries with more breast cancer cases also show significantly higher internet use rates (except when comparing the first two quartiles).
The means of female employment rates, when looking at the same breast cancer quartiles, are also significantly different: F(3, 158) = 7.562, p < 0.0001. Interestingly, here it is only the 25th quartile that shows significantly higher female employment rates (56.15% ± 16.16) than the other quartiles. Apparently, significantly more women were working (in 2007) in countries with only few breast cancer cases (as of 2002) than in countries with higher breast cancer discovery rates.
Ignoring “correlation does not mean causation”, we can now imply that breast cancer indeed causes internet usage (people looking for help and information), and that it also leads to lower female employment rates (because women with cancer don’t go to work).