在需要良好模型可解释性的应用中,决策树效果非常好,尤其是在深度较小的情况下。然而,在真实世界数据集上训练的决策树往往深度很大,深度较高的决策树更容易过拟合,从而导致模型的方差更大。随机森林模型正是为弥补决策树的这一缺点而提出的。在随机森林模型中,对原始训练数据进行有放回的随机抽样,从而生成小的数据子集(见下图)。这些子集也称为自助(bootstrap)样本。然后,这些自助样本作为训练数据输入到许多大深度的决策树中。每个决策树都基于各自的自助样本单独训练。这种决策树的聚合称为随机森林集成。集成模型的最终结果通过统计所有决策树的多数投票来确定。由于每个决策树都以不同的训练数据集作为输入,因此原始训练数据集中的偏差不会影响从决策树聚合中获得的最终结果。
随机森林算法有三个主要超参数,需要在训练之前设置,包括节点大小、树的数量和每次分裂时采样的特征数量。此后,随机森林算法即可用于解决回归或分类问题。随机森林由一组决策树组成,集合中的每棵树都基于从训练集中有放回抽取的数据样本构建,这些样本称为自助(bootstrap)样本。对每棵树而言,约三分之一的训练样本不会被抽中,被留作测试数据,称为袋外(out-of-bag)样本。然后通过特征装袋(feature bagging)注入另一层随机性,为数据集增加更多多样性并降低决策树之间的相关性。预测结果的确定方式取决于问题的类型:对于回归任务,对各个决策树的输出取平均;对于分类任务,多数投票(即最常见的分类变量)将产生预测类别。最后,使用袋外样本进行交叉验证,最终确定该预测。
# Imports for data handling, dimensionality reduction (UMAP), plotting,
# and the scikit-learn models/metrics used throughout this script.
# NOTE(review): the stray "Copy" token from the web page this was pasted
# from has been removed — it made the file a SyntaxError.
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    silhouette_samples,
    silhouette_score,
)
# Load the waveform dataset: one row per unique experiment identifier (uid).
# NOTE(review): 'mypath' is a placeholder — replace with the real path to
# waveforms.csv before running.
mypath = '*insert file path/waveforms.csv'
data = pd.read_csv(mypath, index_col='uid')

# Each recording spans 5 ms, so (number of signal columns) / 5 gives the
# sampling frequency in kHz; the -1 excludes the 'organoid' label column.
print(f'{data.shape[0]} unique experiment identifiers (uid), recorded with a sampling frequency (KHz) of {((data.shape[1]-1)/5)}')
# Class distribution; the value is only displayed in a notebook context,
# the expression result is otherwise discarded.
data.organoid.value_counts()
# Fixed hex-RGB colour per organoid/class so every figure in this script
# uses a consistent palette.
mycolors = {'Data_D': '#FFA500',
            'Data_G': '#4169E1',
            'Data_F': '#FF4500',
            'Data_C': '#9400D3',
            'Data_A': '#32CD32',
            'Data_E': '#228B22',
            'Data_G_V2': '#006400',
            'Data_H': '#00BFFF',
            'Data_E_V2': '#DC143C',
            'Data_F_V2': '#0000FF',
            'Data_B': '#000000',
            }
# Attach the per-row colour. A plain dict lookup (rather than Series.map)
# deliberately raises KeyError if a label is missing from the palette,
# instead of silently producing NaN colours.
data['color'] = data['organoid'].apply(lambda org_id: mycolors[org_id])
# Bar plot of waveform counts per class, coloured with the shared palette.
# Set the font family BEFORE creating the figure: rcParams changes apply
# to artists created afterwards, so the original placement (after all the
# labels were drawn) had no effect on this figure.
plt.rcParams["font.family"] = "Arial"

fig, ax = plt.subplots(figsize=(15, 8))
# Hoisted: value_counts() was computed twice in the original call.
counts = data.organoid.value_counts()
sns.barplot(x=counts.index, y=counts, palette=mycolors)
plt.xticks(rotation=30, fontsize=14)
plt.yticks(fontsize=14)
ax.set_xlabel('Class type', fontsize=16)
ax.set_ylabel('Number of waveforms', fontsize=16)
# Hide the top/right spines for a cleaner look.
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
plt.savefig('Figures/barplot.png', dpi=300, bbox_inches="tight")
plt.show()
# One subplot per class: every waveform in light gray with the class mean
# overlaid in the class colour.
plt.rcParams["font.family"] = "Arial"  # set before drawing so it applies

class_names = data['organoid'].unique()
n_classes = class_names.shape[0]
# Size the subplot row to the actual number of classes. The original
# hard-coded 9 axes and ax[8], which raises IndexError whenever the data
# contains a different number of classes (the palette defines 11).
fig, ax = plt.subplots(1, n_classes, figsize=(24, 4.5))

# Time axis: 150 samples spanning 5 ms (loop-invariant, hoisted).
# NOTE(review): assumes each waveform has exactly 150 signal columns —
# confirm against the CSV.
sampling_freq = np.linspace(0, 5, 150)

# enumerate() replaces the original O(n^2) inner scan that searched
# class_names for the current class's index on every iteration.
for i, unique_class in enumerate(class_names):
    # iloc[:, :-2] drops the trailing 'organoid' and 'color' metadata
    # columns, leaving only the signal samples.
    waveforms = data[data['organoid'] == unique_class].iloc[:, :-2].to_numpy()
    class_mean = np.mean(waveforms, axis=0)
    for waveform in waveforms:
        ax[i].plot(sampling_freq, waveform, color='lightgray')
    ax[i].plot(sampling_freq, class_mean, color=mycolors[unique_class], linewidth=3)
    ax[i].set_ylim([-1.8, 1.8])
    ax[i].grid()
    ax[i].axis('off')
    ax[i].title.set_text(unique_class)

# 1 ms scale bar on the last subplot (ax[-1] instead of hard-coded ax[8]).
scalebar = AnchoredSizeBar(ax[-1].transData, 1, "1 ms", 'lower right',
                           frameon=False, size_vertical=0.02, pad=0.1)
ax[-1].add_artist(scalebar)
plt.savefig('Figures/spikes.png', dpi=300)