数据挖掘中 决策树算法实现――Bash(二)

2014-11-24 08:51:39 · 作者: · 浏览: 1
attrc=${#attr[@]};
for((i=0;i do
a=${attr[$i]};
index=${header_info[$a]};
grepstra[$index]=${attrvs[$i]};
done
fi

for((i=0;i do
if [ -z ${grepstra[$i]} ]; then
grepstra[$i]=".*";
fi
done
grepstrt=${grepstra[*]};
grepstr=${grepstrt// /,};
grep $grepstr $data > current_node_data

## calculate the entropy before split the records
entropy=0;
input=`cat current_node_data | cut -d "," -f 5 | sort | uniq -c | sed 's/^ \+//g' | cut -d " " -f 1`
getEntropy "$input" entropy;

## calculate the entropy for each of the rest attrs
## and select the min one
min_attr_entropy=1;
min_attr_name="";
min_attr_index=0;
for((i=0;i do
## just use the rest attrs
if [[ "$attrs" != *"${headers[$i]}"* ]]; then
## calculate the entropy for the current attr
### get the different values for the headers[$i]
j=$((i+1));
cut -d "," -f $j,$length current_node_data > tmp_attr_ds
dist_values=`cut -d , -f 1 tmp_attr_ds | sort | uniq -c | sed 's/^ \+//g' | sed 's/ /,/g'`;
totle=0;
totle_entropy_attr=0;
for k in $dist_values
do
info=(${k//,/ });
((totle+=${info[0]}));
cur_class_input=`grep "^${info[1]}," tmp_attr_ds | cut -d "," -f 2 | sort | uniq -c | sed 's/^ \+//g' | cut -d " " -f 1`
cur_attr_value_entropy=0;
getEntropy "$cur_class_input" cur_attr_value_entropy;
totle_entropy_attr=$(awk -v c=${info[0]} -v e=$cur_attr_value_entropy -v o=$totle_entropy_attr 'BEGIN{print c*e+o;}');
done
attr_entropy=$(awk -v e=$totle_entropy_attr -v c=$totle 'BEGIN{print e/c;}');
if [ $(echo "$attr_entropy < $min_attr_entropy" | bc) = 1 ]; then
min_attr_entropy=$attr_entropy;
min_attr_name="${headers[$i]}";
min_attr_index=$j;
fi
fi
done

## calculate the gain between the original entropy of the current node
## and the entropy after split by the attribute which has the min_entropy
gain=$(awk -v b=$entropy -v a=$min_attr_entropy 'BEGIN{print b-a;}');

## when the gain is large than 0.1 and then put it as a branch
## and add the child nodes to the current node and push the index to the trace_stack
## otherwise make it as a leaf node and get the class flag
## and do not push trace_stack
if [ $(echo "$gain > 0.1" | bc) = 1 ]; then
### get the attribute values
attr_values_str=`cut -d , -f $min_attr_index current_node_data | sort | uniq`;
attr_values=($attr_values_str);

### generate the node and add to the tree and add their index to the trace_stack
tree_store_length=${#descision_tree[@]};
current_node_struct[0]=$tree_store_length;
parent_node_index=$current_node_index;

attr_value_c=${#attr_values[@]};
for((i=0;i do
tree_store_length=${#descision_tree[@]};
slibling=0;
if [ $i -lt $((attr_value_c-1)) ]; then
slibling=$((tree_store_length+1));
fi

new_attr="";
new_attrvalue="";
if [ $attrs != "N" ]; then
new_attr="$attrs,$min_attr_name";
new_attrvalue="$attrv,${attr_values[$i]}";
else
new_attr="$min_attr_name";
new_attrvalue="${attr_values[$i]}";
fi
new_node="0:$slibling:$parent_node_index:$new_attr:$new_attr_value:0:0";
descision_tree[$tree_store_length]="$new_node";
trace_stack[$stack_deep]=$tree_store_length;
((stack_deep+=1));
done
current_node_update=${current_node_struct[*]};
descision_tree[$current_node_index]=${current_node