Modern Poetry: Crawling the Modern Poetry Database
Preface
JSON Structure
Before crawling the data we first need to decide how it will be stored. Modern Poetry distributes its data as JSON, covering two kinds of information:
- Authors
- Poems
The author records are relatively simple, with four fields: name, src (the URL the author was crawled from), id (a unique id generated from the author's name), and description (a short introduction to the author).
[{
    "name": "author-name",
    "src": "URL",
    "id": "",  // Generated by uuid3 in Python: uuid.uuid3(uuid.NAMESPACE_URL, author-name)
    "description": "Description about author"
}, ...
]
The poem JSON files are more involved and come in two variants:
- Originals
- Translations
An original poem has the fields author, title, paragraphs, and id:
[{
    "author": "author-name",
    "title": "poem-title",
    "paragraphs": [
        "sentence-1", "sentence-2"
    ],
    "id": ""  // Generated by uuid3 in Python: uuid.uuid3(uuid.NAMESPACE_URL, author-name)
}, ...
]
The key point is the structure of paragraphs: the poem body is split on line breaks, so each item in the array is exactly one line of the poem.
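A minimal sketch of how one such record could be assembled (the author, title, and lines below are placeholders, not data from the project):

import uuid

raw = "line-1\nline-2\nline-3"  # placeholder poem text, one verse line per text line
record = {
    "author": "author-name",
    "title": "poem-title",
    # split on line breaks so that each array item is exactly one line of the poem
    "paragraphs": [line.strip() for line in raw.splitlines() if line.strip()],
    # deterministic id: uuid3 of the author's name under the URL namespace
    "id": str(uuid.uuid3(uuid.NAMESPACE_URL, "author-name")),
}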
Translations use the following format:
[{
    "author": "author-name (Chinese name)",
    "title": "poem-title",
    "translation": [
        "sentence-1", "sentence-2"
    ],
    "id": "",  // Generated by uuid3 in Python
    "origin": "the poem's original title"
}, ...
]
The only changes are the added origin field, which holds the poem's original title, and paragraphs being renamed to translation.
Data Sources
For the first phase of the project, covering modern and contemporary Chinese poetry, I chose two sites as the main data sources.
The first is a GitHub repository whose poems are stored as CSV files; the second is a modern Chinese poetry site run by 中国现代诗歌文库编委, whose data has to be crawled.
Overall, then, the data processing splits into two tasks:
- Processing the CSV data
- Crawling the website
Getting the Data
Processing the CSV
Reading the CSV files uses Python's csv library; ids are generated with the uuid library's uuid3 method; and the JSON output is written with the built-in json library.
import csv
import uuid
import json

def csv_1():
    poemFinal = []
    with open('1.csv', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        count = 0
        next(f_csv, None)  # skip the header row
        for i in f_csv:
            dictAppend = {}
            title, classify, author, para = i
            dictAppend['author'] = author
            dictAppend['title'] = title
            dictAppend['paragraphs'] = para  # raw poem text from the CSV cell
            dictAppend['id'] = str(uuid.uuid3(uuid.NAMESPACE_URL, author))
            poemFinal.append(dictAppend)
            count += 1
            if (count + 1) % 500 == 0:  # flush a chunk of poems roughly every 500 rows
                json.dump(poemFinal, open("csv/" + str(count) + '_csv.json', 'w', encoding='utf-8'), ensure_ascii=False)
                poemFinal = []
    json.dump(poemFinal, open("csv/" + str(count) + '_csv.json', 'w', encoding='utf-8'), ensure_ascii=False)

def csv_2():
    poemFinal = []
    with open('2.csv', encoding='utf-8') as n:
        f_csv = csv.reader(n)
        count = 0
        next(f_csv, None)  # skip the header row
        for i in f_csv:
            dictAppend = {}
            title, classify, author, para = i
            dictAppend['author'] = author
            dictAppend['title'] = title
            dictAppend['paragraphs'] = para
            dictAppend['id'] = str(uuid.uuid3(uuid.NAMESPACE_URL, author))
            poemFinal.append(dictAppend)
            count += 1
            if (count + 1) % 500 == 0:
                json.dump(poemFinal, open("csv2/" + str(count) + '_csv.json', 'w', encoding='utf-8'), ensure_ascii=False)
                poemFinal = []
    json.dump(poemFinal, open("csv2/" + str(count) + '_csv.json', 'w', encoding='utf-8'), ensure_ascii=False)

def author(file):
    authorFinal = []
    with open(file, encoding='utf-8') as f:
        author = []
        f_csv = csv.reader(f)
        next(f_csv, None)  # skip the header row
        column = [row[2] for row in f_csv]  # the third column holds the author name
        for i in column:
            if i not in author:  # keep each author only once
                author.append(i)
        for k in author:
            dictAuthor = {"name": k, "src": "https://github.com/Werneror/Poetry", "id": str(uuid.uuid3(uuid.NAMESPACE_URL, k)), "description": ""}
            authorFinal.append(dictAuthor)
    json.dump(authorFinal, open(file + '.min.json', 'w', encoding='utf-8'), ensure_ascii=False)

csv_1()
csv_2()
author("1.csv")
author("2.csv")
csv_1 and csv_2 generate the poem JSON files for the two CSV files; the author function, as its name suggests, generates the author JSON files for both.
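One practical note, not part of the original script: the open calls above assume the csv/ and csv2/ output directories already exist, so it is worth creating them first, for example:

import os

for d in ("csv", "csv2"):
    os.makedirs(d, exist_ok=True)  # create the output directories if they are missing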
Crawling the Website
With the relatively simple CSV files out of the way, the next step is crawling the website, which presents two difficulties:
Messy page layout
- On the surface the site looks simple and uniform, but open a few more poets' pages and you will find that a plain regular expression cannot match the content you need, so the usual approach has to change. One feature every page does share is that each poem is separated by an <hr /> tag. The core of the extraction is therefore to first cut the page into the segments between <hr /> tags, and then match the title and poem body inside each segment (see the sketch after this list).
Garbled Chinese characters
- During crawling, even with utf-8, some odd characters still came out garbled after saving. The final fix was to switch the encoding to gb18030.
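To see why a plain re.findall over the <hr /> delimiters is not enough: consecutive poems share a delimiter, each match consumes the closing <hr />, and every other poem gets skipped. A small illustration with placeholder text (the findAll helper in the full script below restarts the search just past each match to recover every segment; a lookahead would achieve the same thing):

import re

sample = "<hr />poem A<hr />poem B<hr />poem C<hr />"
pattern = re.compile("<hr />(.*?)<hr />", re.S)

print(re.findall(pattern, sample))
# ['poem A', 'poem C'] -- poem B is lost because its leading <hr /> was already consumed

print(re.findall("<hr />(.*?)(?=<hr />)", sample, re.S))
# ['poem A', 'poem B', 'poem C'] -- the lookahead does not consume the closing delimiter

The full crawler script: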
# coding=utf-8
import uuid
import re
import requests
import json
from lxml import etree

requests.adapters.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False

link = 'https://www.shigeku.org/xlib/xd/sgdq'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3464.0 Safari/537.36"}

def findAll(regex, seq):
    # Collect every <hr />-delimited chunk. re.findall cannot return overlapping
    # matches, so the search is restarted just past the start of each match.
    resultlist = []
    pos = 0
    while True:
        result = regex.search(seq, pos)
        if result is None:
            break
        resultlist.append(seq[result.start():result.end()])
        pos = result.start() + 1
    return resultlist

def parse(List):
    # Drop empty items from the list
    while '' in List:
        List.remove('')
    return List

def parseString(string):
    # Strip HTML tags from the string, then remove stray whitespace characters
    str_ = ''
    flag = 1
    for ele in string:
        if ele == "<":
            flag = 0
        elif ele == '>':
            flag = 1
            continue
        if flag == 1:
            str_ += ele
    return str_.replace('\r', '').replace('\n', '').replace(u'\u3000', '').replace(u'\ue004', '').replace(u'\ue003', '').strip()

def author():
    html = s.get(link, headers=headers)
    html.encoding = 'gb18030'
    if html.status_code == requests.codes.ok:
        txt = html.text
        authorList = re.findall('<td align=left width=10%>(.*?)</td>', txt, re.S)
        authorList = parse(authorList)
        authorListFinal = []
        for i in range(0, len(authorList)):
            authorDict = {}
            name = re.findall('>(.*?)<', authorList[i])
            name = name[0]
            src = re.findall('href=(.*?)>', authorList[i])
            src = src[0]
            idAuthor = uuid.uuid3(uuid.NAMESPACE_URL, name)
            authorDict['name'] = name
            authorDict['src'] = src.replace('.htm', '')
            authorDict['id'] = str(idAuthor)
            authorListFinal.append(authorDict)
            authorDesc = s.get(link + '/' + src, headers=headers)
            authorDesc.encoding = 'gb18030'
            # Add the author description from the second <p> of the author page
            xpathHtml = etree.HTML(authorDesc.text)
            authorDescription = xpathHtml.xpath('/html/body/p[2]//text()')
            if len(authorDescription) == 0:
                authorDescription = ''
            else:
                if len(authorDescription[0]) < 5:
                    authorDescription = ''
                else:
                    authorDescription = authorDescription[0].replace('\n', '').replace('\r', '').strip()
            authorDict['description'] = authorDescription  # matches the "description" field of the author schema
            print("Finish ", i)
        json.dump(authorListFinal, open(r'author.json', 'w', encoding='gb18030'), ensure_ascii=False)
        print("Finish!")

def poem():
    authorPoemPre = json.load(open('author.json', 'r', encoding='gb18030'))
    poemList = []
    for i in range(0, len(authorPoemPre)):
        dictAuthor = authorPoemPre[i]
        src = dictAuthor['src'] + '.htm'
        poemHtml = s.get(link + '/' + src, headers=headers)
        poemHtml.encoding = 'gb18030'
        txt = poemHtml.text
        pattern = re.compile("<hr />(.*?)<hr />", re.S)
        tempHrList = findAll(pattern, txt)
        for k in range(0, len(tempHrList)):
            dictFinalPoem = {"author": dictAuthor['name']}
            st = tempHrList[k]
            title = re.findall('<p align="center">(.*?)</p>', st, re.S)
            if len(title) == 0:
                title = '填充'  # placeholder title when none is found
            else:
                title = parseString(title[0])  # take only the first match
            st = re.sub('<p align="center">(.*?)</p>', '', st, flags=re.S)
            content = parse(parseString(st).split())
            for m in range(0, len(content)):
                content[m] = content[m].strip()
            dictFinalPoem['title'] = title
            dictFinalPoem['paragraphs'] = content
            dictFinalPoem['id'] = dictAuthor['id']
            poemList.append(dictFinalPoem)
        print("Finish ", i)
        json.dump(poemList, open(str(i) + '.json', 'w', encoding='gb18030'), ensure_ascii=False)  # dump the poems collected so far

def text():
    authorPoemPre = json.load(open('author.json', 'r', encoding='gb18030'))
    for i in range(0, len(authorPoemPre)):
        dictAuthor = authorPoemPre[i]
        src = dictAuthor['src'] + '.htm'
        poemHtml = s.get(link + '/' + src, headers=headers)
        poemHtml.encoding = 'gb18030'
        txt = poemHtml.text
        pattern = re.compile("<hr />(.*?)<hr />", re.S)
        tempHrList = findAll(pattern, txt)
        for k in range(0, len(tempHrList)):
            st = tempHrList[k]
            st = re.sub('<p align="center">(.*?)</p>', '', st, flags=re.S)  # drop the centred title block
            content = parseString(st).replace(' ', '。') + "。"
            with open('analyze.txt', 'a', encoding='gb18030') as f:
                f.write(content)
        print("Finish ", i)

author()  # generate the author records
poem()    # generate the poem records
text()    # dump all poem text as plain text, used for the word cloud below
Another problem that often comes up while crawling is the server refusing connections; there is nothing for it but to keep retrying!
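A small retry wrapper can take some of the pain out of this; get_with_retry and its parameters below are only a sketch, not part of the original script:

import time
import requests

def get_with_retry(url, headers, retries=5, delay=3):
    # Try the request a few times, sleeping between attempts, before giving up.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = 'gb18030'
            if resp.status_code == requests.codes.ok:
                return resp
        except requests.exceptions.RequestException:
            pass  # connection refused or timed out; wait and try again
        time.sleep(delay)
    return None  # the caller decides what to do after repeated failures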
Data Analysis
Here I ran the simplest possible analysis, a word cloud: which words appear most frequently in the poems of modern and contemporary poets.
import csv
from wordcloud import WordCloud
import PIL.Image as image
import jieba

def get_stopwords_list():
    # Stop words: words filtered out before the word cloud is built
    stopwords = [line.strip() for line in open('stopwords.txt', encoding='gb18030').readlines()]
    return stopwords

def csv_1():
    with open('1.csv', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        column = [row[3] for row in f_csv]  # the fourth column holds the poem text
        with open('csv1.txt', 'a', encoding='gb18030') as k:
            for m in column:
                k.write(m)

def csv_2():
    with open('2.csv', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        column = [row[3] for row in f_csv]
        with open('csv2.txt', 'a', encoding='gb18030') as k:
            for m in column:
                k.write(m)

def trans_CN(text):
    # Segment the Chinese text with jieba and join the tokens with spaces
    word_list = jieba.cut(text)
    result = " ".join(word_list)
    return result

def move_stopwords(sentence, stopwords_list):
    # Remove stop words from the segmented text
    for i in stopwords_list:
        if i in sentence:
            sentence = sentence.replace(i, '')  # str.replace returns a new string, so re-assign it
    return sentence

def analyze(file):
    with open(file, encoding="gb18030") as fp:
        stopwords = get_stopwords_list()
        text = fp.read()
        text = trans_CN(text)
        text = move_stopwords(text, stopwords)
        wordcloud = WordCloud(background_color=(255, 255, 255),
                              font_path=r"C:\Windows\Fonts\simhei.ttf",  # a font with Chinese glyphs is required
                              width=1600, height=800).generate(text)
        image_produce = wordcloud.to_image()
        image_produce.save('cloud.png', quality=95, subsampling=0)  # note: a later analyze() call overwrites this file
        image_produce.show()

csv_1()
csv_2()
analyze('csv1.txt')
analyze('csv2.txt')
First, the CSV files generated earlier are reused once more to produce a plain-text dump of all the poems; then comes the word-cloud analysis itself, which has two parts:
- word segmentation with jieba
- word-cloud generation with wordcloud
jieba is a Chinese word-segmentation library: it splits Chinese sentences into individual words and does the job well. The stopwords.txt used above holds the stop words, i.e. the words that information retrieval filters out, since in poems they are mostly meaningless filler. Stop-word lists are easy to find online; just place one in the program's directory.
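As a quick illustration of what jieba produces (the line is just a placeholder, and the exact segmentation depends on jieba's dictionary): the segmented text is simply the words joined by spaces, which is the whitespace-separated input that wordcloud expects.

import jieba

sample = "黑夜给了我黑色的眼睛"  # placeholder line for illustration
print(" ".join(jieba.cut(sample)))
# e.g. 黑夜 给 了 我 黑色 的 眼睛 -- the exact tokens depend on jieba's dictionary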
The wordcloud library is then used to generate the word cloud itself.
The resulting word cloud: