"""
__author__ = 'jockqiao'
__date__ = '2018/9/13 9:36'
__email__ = 'LQ65535@163.com'
"""
import pandas as pd
import numpy as np
def Chi2(df, total_col, bad_col, overallRate):
    """Return the chi-square statistic of the bad counts against the overall bad rate.

    For every row the expected bad count is ``total * overallRate``; the result
    is the sum over rows of ``(expected - observed) ** 2 / expected``.  Only the
    "bad" class contributes to the statistic.

    :param df: DataFrame with one row per value (or per bin)
    :param total_col: name of the column holding the total count of each row
    :param bad_col: name of the column holding the bad count of each row
    :param overallRate: overall share of bad records
    :return: the chi-square value as a float; rows whose expected count is 0
        contribute nothing and an empty frame gives 0.0
    """
    expected = df[total_col] * overallRate
    observed = df[bad_col]
    # A zero expected count would divide by zero and put inf/NaN into the sum,
    # which then breaks every min()/threshold comparison made by the callers.
    usable = expected != 0
    deviation = expected[usable] - observed[usable]
    return float((deviation ** 2 / expected[usable]).sum())
# 基于卡方阈值卡方分箱,有个缺点,不好控制分箱个数。
def ChiMerge_MinChisq(df, col, target, confidenceVal=3.841):
    """Bin ``col`` by merging neighbouring intervals until a chi-square threshold is met.

    Every distinct value starts as its own interval.  On each pass the interval
    with the smallest chi-square (as computed by ``Chi2``) is merged with the
    adjacent interval whose chi-square is smaller (the left one on ties).
    Merging stops when the smallest chi-square reaches ``confidenceVal`` or a
    single interval is left.  The number of resulting bins is not bounded, so
    the result can be very fine-grained.

    NOTE(review): the statistic is computed per interval, not per pair of
    adjacent intervals as in the textbook ChiMerge - confirm this is intended.

    :param df: source DataFrame
    :param col: name of the feature to bin; its values must be sortable
    :param target: name of the 0/1 target column (1 = bad, 0 = good), so that
        the per-value sum is the number of bad records
    :param confidenceVal: stop threshold; 3.841 is the chi-square critical
        value for 1 degree of freedom at 0.95 confidence
    :return: list of intervals, each a list of the original values in
        ascending order; ``[]`` when ``df`` has no rows
    """
    colLevels = sorted(set(df[col]))
    if not colLevels:
        # Nothing to bin; the overall rate below would divide by zero.
        return []
    # Per distinct value: number of records ('total') and number of bad ones ('bad').
    regroup = (
        df.groupby(col)[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'bad'})
        .reset_index()
    )
    overallRate = regroup['bad'].sum() * 1.0 / regroup['total'].sum()
    groupIntervals = [[level] for level in colLevels]
    while len(groupIntervals) > 1:
        chisqList = [
            Chi2(regroup.loc[regroup[col].isin(interval)], 'total', 'bad', overallRate)
            for interval in groupIntervals
        ]
        min_chisq = min(chisqList)
        if min_chisq >= confidenceVal:
            break
        min_position = chisqList.index(min_chisq)
        if min_position == 0:
            combinedPosition = 1
        elif min_position == len(groupIntervals) - 1:
            combinedPosition = min_position - 1
        elif chisqList[min_position - 1] <= chisqList[min_position + 1]:
            combinedPosition = min_position - 1
        else:
            combinedPosition = min_position + 1
        # Merge lower-index interval first so the merged values stay in
        # ascending order, and replace both slots by position rather than
        # removing a list by value.
        lower, upper = sorted((min_position, combinedPosition))
        groupIntervals[lower:upper + 1] = [groupIntervals[lower] + groupIntervals[upper]]
    return groupIntervals
# 最大分箱数分箱
def ChiMerge_MaxInterval_Original(df, col, target, max_interval=5):
    """Bin ``col`` by merging neighbouring intervals until at most ``max_interval`` remain.

    Every distinct value starts as its own interval.  While there are more
    than ``max_interval`` intervals, the one with the smallest chi-square (as
    computed by ``Chi2``) is merged with the adjacent interval whose
    chi-square is smaller (the left one on ties).

    :param df: source DataFrame
    :param col: name of the feature to bin; its values must be sortable
    :param target: name of the 0/1 target column; the per-value sum is used as
        the number of bad records
    :param max_interval: maximum number of bins, at least 1
    :return: ascending list of cut-off points, i.e. the largest value of every
        bin except the last one.  When ``col`` has no more than
        ``max_interval`` distinct values no merging is done and every distinct
        value except the largest is returned.
    :raises ValueError: if ``max_interval`` is smaller than 1
    """
    if max_interval < 1:
        # Otherwise the merge loop below runs down to a single interval and
        # then fails with an IndexError while looking for a neighbour.
        raise ValueError('max_interval must be at least 1')
    colLevels = sorted(set(df[col]))
    if len(colLevels) <= max_interval:
        # Too few distinct values to need merging: each value is its own bin.
        print("the row is cann't be less than interval numbers")
        return colLevels[:-1]
    # Per distinct value: number of records ('total') and number of bad ones ('bad').
    regroup = (
        df.groupby(col)[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'bad'})
        .reset_index()
    )
    overallRate = regroup['bad'].sum() * 1.0 / regroup['total'].sum()
    groupIntervals = [[level] for level in colLevels]
    while len(groupIntervals) > max_interval:
        chisqList = [
            Chi2(regroup.loc[regroup[col].isin(interval)], 'total', 'bad', overallRate)
            for interval in groupIntervals
        ]
        min_position = chisqList.index(min(chisqList))
        if min_position == 0:
            combinedPosition = 1
        elif min_position == len(groupIntervals) - 1:
            combinedPosition = min_position - 1
        elif chisqList[min_position - 1] <= chisqList[min_position + 1]:
            combinedPosition = min_position - 1
        else:
            combinedPosition = min_position + 1
        # Merge the two neighbours in place, lower-index interval first.
        lower, upper = sorted((min_position, combinedPosition))
        groupIntervals[lower:upper + 1] = [groupIntervals[lower] + groupIntervals[upper]]
    groupIntervals = [sorted(interval) for interval in groupIntervals]
    print(groupIntervals)
    return [interval[-1] for interval in groupIntervals[:-1]]
# 计算WOE和IV值
def CalcWOE(df, col, target):
    """Compute the weight of evidence of every bin and the information value.

    For each bin: ``WOE = ln(good_pcnt / bad_pcnt)`` and
    ``IV = (good_pcnt - bad_pcnt) * WOE``, where ``good_pcnt`` / ``bad_pcnt``
    are the bin's share of all good / all bad records.

    A bin without bad or without good records gets an infinite WOE and IV
    (NaN when both shares are zero); no smoothing is applied.

    :param df: source DataFrame
    :param col: name of the already binned column
    :param target: name of the 0/1 target column; the per-bin sum is used as
        the number of bad records
    :return: dict with
        ``'WOE'``    - ``{bin: {'WOE': value}}``,
        ``'IV_sum'`` - total information value,
        ``'IV'``     - Series of per-bin IV, positionally indexed in ascending
        bin order
    """
    regroup = (
        df.groupby(col)[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'bad'})
        .reset_index()
    )
    regroup['good'] = regroup['total'] - regroup['bad']
    bad_pcnt = regroup['bad'] * 1.0 / regroup['bad'].sum()
    good_pcnt = regroup['good'] * 1.0 / regroup['good'].sum()
    # Bins lacking one class legitimately produce inf/NaN here; silence the
    # floating-point warnings instead of letting them flood the output.
    with np.errstate(divide='ignore', invalid='ignore'):
        woe = np.log(good_pcnt / bad_pcnt)
        iv = (good_pcnt - bad_pcnt) * woe
    WOE_dict = {level: {'WOE': value} for level, value in zip(regroup[col], woe)}
    # skipna=False keeps a NaN bin visible in the total instead of hiding it.
    IV_SUM = float(iv.sum(skipna=False))
    return {'WOE': WOE_dict, 'IV_sum': IV_SUM, 'IV': iv}
# 分箱以后检查每箱的bad_rate的单调性,如果不满足,那么继续进行相邻的两项合并,直到bad_rate单调为止
def BadRateMonotone(df, sortByVar, target):
    """Tell whether the bad rate is monotone across the bins of ``sortByVar``.

    The bad rate of a bin is the sum of ``target`` divided by the number of
    records in the bin; bins are visited in ascending order of their value.
    Equal neighbouring rates are accepted in either direction, and zero or one
    bin counts as monotone.

    :param df: source DataFrame
    :param sortByVar: name of the already binned column
    :param target: name of the 0/1 target column
    :return: True if the bad rates are entirely non-decreasing or entirely
        non-increasing, otherwise False
    """
    # groupby sorts the bins by key, so no explicit sort is needed beforehand.
    grouped = df.groupby(sortByVar)[target]
    badRate = (grouped.sum() * 1.0 / grouped.count()).tolist()
    neighbours = list(zip(badRate, badRate[1:]))
    non_decreasing = all(left <= right for left, right in neighbours)
    non_increasing = all(left >= right for left, right in neighbours)
    return non_decreasing or non_increasing
# 检查最大箱,如果最大箱里面数据数量占总数据的90%以上,那么弃用这个变量
def MaximumBinPcnt(df, col):
    """Return the share of rows that fall into the most populated bin of ``col``.

    The caller can use this to drop a variable whose largest bin dominates the
    data (the accompanying note in this file mentions a 90% limit).

    :param df: source DataFrame
    :param col: name of the binned column
    :return: largest per-value row count divided by the total number of rows
        (missing values form no bin but still count towards the total);
        0.0 when there are no rows or no non-missing values
    """
    N = df.shape[0]
    if N == 0:
        return 0.0
    counts = df.groupby(col)[col].count()
    if counts.empty:
        # Every value is missing, so there is no bin to measure.
        return 0.0
    return float(counts.max()) / N
# 对于类别型数据,以bad_rate代替原有值,转化成连续变量再进行分箱计算。比如我们这里的户籍地代码,就是这种数据格式
# 当然如果类别较少时,原则上不需要分箱
def BadRateEncoding(df, col, target):
    """Encode a categorical column by the bad rate of each category.

    The bad rate of a category is the sum of ``target`` divided by the number
    of records in that category.  The encoded column is continuous and can
    then be binned like any numeric feature.

    :param df: source DataFrame
    :param col: name of the column to encode
    :param target: name of the 0/1 target column
    :return: dict with
        ``'encoding'`` - Series aligned with ``df`` holding the bad rate of
        each row's category (NaN where ``col`` is missing),
        ``'br_rate'``  - ``{category: {'bad_rate': rate}}``
    """
    grouped = df.groupby(col)[target]
    bad_rate = grouped.sum() * 1.0 / grouped.count()
    br_dict = {level: {'bad_rate': rate} for level, rate in bad_rate.items()}
    # Mapping through the Series leaves NaN for values that have no group
    # (missing values) instead of raising KeyError on a dict lookup.
    badRateEnconding = df[col].map(bad_rate)
    return {'encoding': badRateEnconding, 'br_rate': br_dict}
class Woe_IV:
    """Bin a list of features and export a WOE / IV report for each of them."""

    def __init__(self, df, colList, target):
        """Store the data and the binning configuration.

        :param df: DataFrame to bin; ``to_band`` adds the band columns to it
        :param colList: list of dicts, one per feature, for example::

                colList = [
                    {
                        'col': 'openning_room_num_n3',
                        'bandCol': 'openning_room_num_n3_band',
                        'bandNum': 6,
                        'toCsvPath': '/home/liuweitang/yellow_model/data/mk/my.txt',
                    },
                ]

            ``col`` is the feature, ``bandCol`` the column receiving the bin
            index, ``bandNum`` the number of bins and ``toCsvPath`` the file
            the report is written to.
        :param target: name of the 0/1 target column (1 = bad, 0 = good)
        """
        self.df = df
        self.colList = colList
        self.target = target

    def _assign_bands(self, col, bandCol, cutOffPoints, first_is_lower_bound):
        """Write the bin index of every row into ``bandCol`` and return the bin labels.

        Bins are ``(previous cut-off, cut-off]`` plus a final open bin above
        the last cut-off.  When ``first_is_lower_bound`` is false an extra
        leading bin ``<= cutOffPoints[0]`` is created; otherwise the first
        cut-off only bounds the first bin from below and rows at or below it
        receive no bin.
        """
        values = self.df[col]
        labels = []
        indexValue = 0
        if cutOffPoints and not first_is_lower_bound:
            self.df.loc[values <= cutOffPoints[0], bandCol] = indexValue
            labels.append('0-' + str(cutOffPoints[0]))
            indexValue += 1
        for lower, upper in zip(cutOffPoints, cutOffPoints[1:]):
            self.df.loc[(values > lower) & (values <= upper), bandCol] = indexValue
            labels.append(str(lower + 1) + "-" + str(upper))
            indexValue += 1
        if cutOffPoints:
            self.df.loc[values > cutOffPoints[-1], bandCol] = indexValue
            labels.append(str(cutOffPoints[-1] + 1) + "-")
        return labels

    def to_band(self):
        """Bin every configured feature, check it and export its WOE / IV table.

        For each entry of ``colList``: obtain cut-off points from
        ``ChiMerge_MaxInterval_Original``, write the bin index into
        ``bandCol`` of ``self.df``, skip the feature (with a printed message)
        when no bins could be assigned or the bad rate is not monotone across
        bins, otherwise compute WOE / IV with ``CalcWOE`` and write one row
        per bin (IV, WOE, bad, good, bin label) to ``toCsvPath``.
        """
        for colParam in self.colList:
            col = colParam['col']
            bandCol = colParam['bandCol']
            bandNum = colParam['bandNum']
            cutOffPoints = ChiMerge_MaxInterval_Original(self.df, col, self.target, bandNum)
            print(cutOffPoints)
            value_band = []
            if len(cutOffPoints) == bandNum - 1:
                # The first cut-off closes a bin of its own.
                print('len-1 type')
                value_band = self._assign_bands(col, bandCol, cutOffPoints, False)
            elif len(cutOffPoints) == bandNum:
                # The first cut-off is only the lower bound of the first bin.
                print('len type')
                value_band = self._assign_bands(col, bandCol, cutOffPoints, True)
            if not value_band:
                # Neither layout matched, so no band column was written;
                # report it like the monotonicity failure instead of crashing.
                print(col + ' band error, reason is cut-off count does not match bandNum')
                continue
            band = self.df[bandCol]
            if band.notna().all():
                # Column was created through .loc assignment and is float;
                # store the integer version (only safe without missing bins).
                self.df[bandCol] = band.astype(int)
            # Binning is done; a non-monotone bad rate disqualifies the result.
            isMonotone = BadRateMonotone(self.df, bandCol, self.target)
            if not isMonotone:
                print(colParam['col'] + ' band error, reason is not monotone')
                continue
            woe_IV = CalcWOE(self.df, bandCol, self.target)
            woe = woe_IV['WOE']
            woe_result = [woe[key]['WOE'] for key in sorted(woe)]
            iv_result = woe_IV['IV'].tolist()
            # Per-bin counts derived the same way as everywhere else in this
            # file (bad = sum of target), so a bin lacking one class gives 0
            # instead of a failed lookup.
            grouped = self.df.groupby(bandCol)[self.target]
            bad_series = grouped.sum()
            bad_count = bad_series.tolist()
            good_count = (grouped.count() - bad_series).tolist()
            print(value_band)
            print(good_count)
            print(bad_count)
            print(woe_result)
            print(iv_result)
            # Collect the report and export it as csv.
            woe_iv_df = pd.DataFrame({
                'IV': iv_result,
                'WOE': woe_result,
                'bad': bad_count,
                'good': good_count,
                colParam['bandCol']: value_band
            })
            bad_good_count = self.df.groupby([bandCol, self.target])[self.target].count()
            print('bad_good_count{}'.format(bad_good_count))
            woe_iv_df.to_csv(colParam['toCsvPath'])
            print(colParam['col'] + 'band finished')
# Example run: load the sample workbook and bin one feature into 4 bands,
# using the 'label' column as the 0/1 target.
openning_data = pd.read_excel('chi.xlsx')
colList = [
    {
        'col': 'openning_room_0_6_num_n3',
        'bandCol': 'openning_room_0_6_num_n3_band',
        'bandNum': 4,
        'toCsvPath': 'openning_room_0_6_num_n3_band.csv',
    },
]
band2 = Woe_IV(openning_data, colList, 'label')
# Prints the intermediate results and writes the report to 'toCsvPath'.
band2.to_band()
# Chi-square binning dataset (卡方分箱数据集)
# Original article; please credit the source when reposting: http://124.221.219.47/article/kafang/