1.文本
2.数据统计代码
def get_text_length(dir):
    """Collect line-length statistics for every ``.txt`` file in a directory.

    Each line is measured with ``len(line)`` exactly as read from the file,
    so a trailing line terminator, when present, is part of the count.
    Lines shorter than 50 characters are printed for manual inspection.

    Args:
        dir: Directory to scan.  The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        A tuple ``(text_max, text_min, l)``: the longest and shortest line
        lengths seen, and a list where ``l[n]`` is the number of lines of
        length ``n``.  The list always has at least 3037 slots and grows
        when a longer line is encountered.  When no line is read at all the
        result is ``(0, 999999, [0] * 3037)``.
    """
    text_max = 0
    text_min = 999999
    # 3037 was the original fixed histogram size; it is kept as a floor so
    # the returned list never gets shorter than callers have seen before.
    l = [0] * 3037

    # Only the .txt files are read here, so select them directly by suffix
    # instead of rebuilding their names from a dot-split stem.
    txt_files = sorted(name for name in os.listdir(dir) if name.endswith('.txt'))
    for txt_file in txt_files:
        path = os.path.join(dir, txt_file)
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                lens = len(line)
                if lens >= len(l):
                    # Grow the histogram instead of failing on long lines.
                    l.extend([0] * (lens + 1 - len(l)))
                l[lens] += 1
                if lens < 50:
                    print(line)
                text_max = max(text_max, lens)
                text_min = min(text_min, lens)
    return text_max, text_min, l


tmax, tmin, l = get_text_length(test_dir)
print(tmax, tmin)

# 3. Draw the histogram (section heading of the original notes)
# Bar chart of the per-length line counts stored in ``l``:
# x is the line length, y is how many lines have that length.
fig = plt.figure(figsize=(18, 8))
plt.title('Length statistics of text', fontsize=13)
plt.xlabel(u'length', fontsize=13)
plt.ylabel(u'quantity', fontsize=13)
plt.bar(
    range(len(l)),
    l,
    width=0.8,
    edgecolor='#0077CC',
    linewidth=1,
)

# The figure handle is held before show(); the image is written from that
# handle once show() has returned.
plt.show()
fig.savefig('./length_statistics_of_text.png')

# 1. Text (section heading of the original notes)
2.统计数据代码
def get_entities(dir):
    """Count annotation entries in ``.ann`` files, grouped by entity length.

    Every line is split on tab characters and the third field is taken as
    the entity text.  Lines with fewer than three tab-separated fields are
    skipped.  Lines whose entity length is 1 or 13 are printed for manual
    inspection.

    Args:
        dir: Directory to scan.  The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        A tuple ``(text_max, text_min, entities)``: the largest and smallest
        entity lengths seen, and a dict mapping each length to the number of
        entities with that length.  When nothing is counted the result is
        ``(0, 999999, {})``.
    """
    entities = {}
    text_max = 0
    text_min = 999999

    # Only the .ann files are read here, so select them directly by suffix
    # instead of rebuilding their names from a dot-split stem.
    ann_files = sorted(name for name in os.listdir(dir) if name.endswith('.ann'))
    for ann_file in ann_files:
        path = os.path.join(dir, ann_file)
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                fields = line.split('\t')
                if len(fields) < 3:
                    # No third field means no entity text to measure.
                    continue
                name = fields[2]
                # NOTE(review): when the third field is the last one on the
                # line it still carries the line terminator, so the length
                # below includes it.  Left unchanged because the plotting
                # code downstream is tuned to these bucket values - confirm
                # whether the terminator should be stripped.
                lens = len(name)
                if lens == 1 or lens == 13:
                    print(line)
                entities[lens] = entities.get(lens, 0) + 1
                text_max = max(text_max, lens)
                text_min = min(text_min, lens)
    return text_max, text_min, entities


text_max, text_min, entities = get_entities(train_dir)
# Bare expression: has no effect in a script; presumably kept so a notebook
# cell echoes the values.
text_max, text_min, entities

# 3. Histogram (section heading of the original notes)
from matplotlib import pyplot as plt

print(entities)
# Order the (length, count) pairs by length, ascending.
entities_order = sorted(entities.items(), key=lambda x: x[0], reverse=False)
print(entities_order)
# Turn the sorted list of pairs back into a dict (now in ascending key order).
dic = dict(entities_order)
print(dic)
print(list(dic.keys()))
print(list(dic.values()))

num_list = list(dic.values())
# NOTE(review): the last bucket is dropped and the one before it is bumped
# by exactly 1.  This presumably folds a single outlier into the previous
# bucket for one particular data set - confirm before reusing on other data.
# The guard avoids an IndexError when there are fewer than two buckets.
if len(num_list) >= 2:
    num_list[-2] += 1
    num_list = num_list[:-1]

# Tick labels: the entity lengths that belong to the bars actually drawn.
# NOTE(review): name_list was not defined anywhere in the visible source;
# deriving it from the sorted lengths is an assumption - confirm.
name_list = list(dic.keys())[:len(num_list)]


def autolabel(rects):
    """Write each bar's height as a text label centred just above the bar."""
    for rect in rects:
        height = rect.get_height()
        # ha/va control the horizontal and vertical alignment of the label.
        plt.text(
            rect.get_x() + rect.get_width() / 2,
            height,
            height,
            ha='center',
            va='bottom',
        )


plt.figure(figsize=(18, 8))
plt.title('Length statistics of entity', fontsize=13)
plt.xlabel(u'length', fontsize=13)
plt.ylabel(u'quantity', fontsize=13)
autolabel(plt.bar(range(len(num_list)), num_list, width=0.8, edgecolor='darkblue', lw=1))
# Set the tick labels only after the figure and bars exist, so they are
# applied to the figure that is shown and saved.
plt.xticks(range(len(num_list)), name_list, rotation=0)
fig = plt.gcf()
plt.show()
fig.savefig('./length_statistics_of_entity.png')

# 4. Results (section heading of the original notes)