continuous -> categorical 변수로 만들기
[age]
/***sas***/
DATA WORK.HD1;
    SET HD_origin;
    /* age_c = decade bucket of age: 0 for 0-9, 1 for 10-19, ..., 9 for 90-99. */
    /* Ages outside 0-99 (a missing age compares below 0) are not assigned a bucket. */
    /* FLOORZ is used instead of FLOOR so that no fuzzing is applied near decade boundaries. */
    if age >= 0 and age < 100 then age_c = floorz(age / 10);
### python
# Decade bucket of age (e.g. 37 -> 3), computed with vectorized floor division.
hd["age_c"] = hd["age"] // 10
[BMI] (결측값 존재)
/***sas***/
/* Classify BMI into bmi_c: "NT" = not available, 'L' = below 25, 'H' = 25 or above. */
/* BMI is treated as a character column in which '.' marks a missing value. */
/* Convert explicitly: '.', blanks and unparseable text all become numeric missing */
/* (?? suppresses the invalid-data log messages). Without the missing check such   */
/* rows would fall into 'L', because a missing value compares below 25.            */
_bmi_value = input(BMI, ?? 8.);
if missing(_bmi_value) then bmi_c = "NT";
else if _bmi_value < 25 then bmi_c = 'L';
else bmi_c = 'H';
/* Helper used only for the comparison; keep it out of the output dataset. */
drop _bmi_value;
RUN;
quartile 나누기
/***sas***/
/* HD2: copy of HD1 without rows whose BMI is missing, plus a numeric BMI_num. */
DATA HD2;
    SET HD1;
    /* Convert the character BMI first; '.', blanks and unparseable text all become */
    /* numeric missing (?? suppresses the invalid-data log messages).               */
    BMI_num = input(BMI, ?? 8.);
    /* Drop every row without a usable BMI, not only those holding the literal '.'. */
    if missing(BMI_num) then delete;
run;
/* Partition BMI_num into 4 rank groups and store the group number in BMI_q. */
/* The output dataset replaces WORK.HD2 in place. */
PROC RANK DATA=WORK.HD2 OUT=WORK.HD2 GROUPS=4;
    VAR   BMI_num;
    RANKS BMI_q;
RUN;
### python
# Build hd_bmi: rows of hd with a usable BMI, plus the BMI quartile in bmi_q.
# BMI can hold the text placeholder ".", so coerce it to numeric first:
# "." and any other unparseable value become NaN and are dropped with the real NaNs.
# Without the conversion pd.qcut is handed text values and cannot compute quantiles.
hd_bmi = (
    hd.assign(BMI=pd.to_numeric(hd["BMI"], errors="coerce"))
    .dropna(subset=["BMI"])
    .copy()  # independent frame, so adding a column below does not touch a slice of hd
)
# labels=False yields integer quartile codes 0-3.
hd_bmi["bmi_q"] = pd.qcut(hd_bmi["BMI"], 4, labels=False)
chi square (p-value)
/***sas***/
/* Two-way tables of sex against each disease indicator, with chi-square tests */
/* and relative-risk measures; exact p-values are requested for the Pearson    */
/* chi-square and the odds ratio.                                              */
PROC FREQ DATA=HD2;
    /* sex*(a b ...) is the grouped form of sex*a sex*b ... */
    TABLES sex * (lung_cancer cancer pneumonia asthma
                  RD CD BD ENTD DD OD heart)
           / CHISQ RELRISK;
    EXACT PCHI OR;
RUN;
### python
import scipy.stats as stats
from scipy.stats import chi2_contingency
# Chi-square test of independence between each disease indicator and occupation.
# NOTE(review): the SAS version of this analysis crosses the diseases with sex,
# while this loop uses "occupation" - confirm which grouping variable is intended.
disease = ['lung_cancer', 'cancer', 'pneumonia',
           'asthma', 'RD', 'CD', 'BD', 'ENTD', 'DD', 'OD', 'heart']
for i in disease:
    crosstab = pd.crosstab(hd[i], hd["occupation"])
    # chi2_contingency returns (statistic, p-value, dof, expected); run it once per table.
    p_value = stats.chi2_contingency(crosstab)[1]
    if p_value > 0.0001:
        print("{} :".format(i), p_value)  # only p-values above 0.0001 are printed