本文共 3152 字,大约阅读时间需要 10 分钟。
想做个成语接龙,就爬取了某网站的所有成语及其拼音。中间因个别页面抓取出错,程序中断了两次,浪费了一天的时间;给抓取代码加上 try/except、跳过出错的页面后就好了。
# Crawl every idiom (chengyu) and its pinyin from chengyu.t086.com and dump
# them to words.txt, one "(idiom, [syllable, ...])" tuple repr per line.
from lxml import etree
import requests
from urllib.parse import urljoin
import re
import time
import random

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'zh-CN,zh,en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
}

# Seconds to wait for a response; without a timeout a single stalled
# connection would hang the whole crawl indefinitely.
TIMEOUT = 10

s = requests.Session()
s.headers.update(headers)
url = 'http://chengyu.t086.com/'

# Fetch the home page; the responses are decoded as GBK throughout.
r = s.get(url=url, timeout=TIMEOUT)
r.encoding = 'GBK'
tree = etree.HTML(r.text)

urls = []   # work queue of list pages still to visit
words = []  # collected (idiom, [pinyin syllables]) tuples

# Selecting the @href attribute directly skips anchors that have no href
# instead of crashing on them. Links containing 'cy' are not queued.
for href in tree.xpath("/html/body/div[3]/a/@href"):
    if 'cy' not in href:
        urls.append(urljoin(url, href))

# NOTE: `urls` grows while it is being iterated. Each list page that has a
# "next page" link appends its successor, and the for-loop picks it up.
for line in urls:
    try:
        # Randomised delay between requests to go easy on the server.
        time.sleep(0.3 + random.random())
        r = s.get(url=line, timeout=TIMEOUT)
        r.encoding = 'GBK'
        tree = etree.HTML(r.text)

        for link in tree.xpath('//div[@class="listw"]//a/@href'):
            wordurl = urljoin(url, link)
            time.sleep(0.3 + random.random())
            try:
                r2 = s.get(url=wordurl, timeout=TIMEOUT)
                r2.encoding = 'GBK'
                tree2 = etree.HTML(r2.text)
                word = tree2.xpath('//*[@id="main"]//h1/text()')[0]
                raw_pinyin = tree2.xpath(
                    '//*[@id="main"]//td[text() = "发音"]'
                    '/following-sibling::td[1]/text()'
                )[0]
                # Split on runs of whitespace and drop empty pieces so that
                # the first/last syllable is always a real syllable.
                pingyin = [p for p in re.split(r'\s+', raw_pinyin) if p]
                print(word, pingyin)
                words.append((word, pingyin))
            except Exception as exc:
                # Best effort: skip a broken idiom page and keep crawling.
                # `Exception` (not a bare except) lets Ctrl+C still stop the run.
                print(' Wrong url:' + wordurl, repr(exc))

        # Queue the next page of this list: ".../xxx_N.html" -> ".../xxx_N+1.html".
        nextUrl = tree.xpath("//a[text() = '下一页']")
        if nextUrl:
            # rsplit on the last underscore so an underscore elsewhere in the
            # URL does not break the unpacking.
            x, y = line.rsplit('_', 1)
            y1 = int(y.split('.')[0])
            newline = x + '_' + str(y1 + 1) + '.html'
            urls.append(newline)
    except Exception as exc:
        # Best effort: a failing list page is reported and skipped.
        print('first wrong url', line, repr(exc))

# One tuple repr per line. The file is written with the platform default
# encoding, so it must be read back with the same default.
with open('words.txt', 'w') as f:
    for term in words:
        print(term, file=f)
爬出来的成语是这样的
有3万多个成语,不想自己爬的话,有链接直接下载:(密码0v9i)
有了成语,接下来就是做个接龙程序了。
思路:
1、判断是否成语。
2、为了减少搜索量,先收集首字读音与输入成语末字读音相同的成语(同音字成语)。
3、在这些同音字成语中判断是否有首字与输入成语末字是同一个汉字的,如果有,返回相同汉字的成语,否则返回同音字成语。
# Idiom solitaire (成语接龙): given an idiom, answer with an idiom whose first
# syllable matches the last syllable of the input, preferring one that starts
# with the very same character. Reads the idiom list from words.txt.
import ast
import random

WORDS_FILE = 'words.txt'
# After this many automatic answers in a row, hand control back to the user
# (too much output at once is hard to follow).
MAX_AUTO_TURNS = 10


def load_words(path=WORDS_FILE):
    """Load the idiom list from *path*.

    Each non-blank line must be the repr of an ``(idiom, [syllable, ...])``
    tuple. Lines are parsed with ``ast.literal_eval`` rather than ``eval`` so
    that a tampered file cannot execute arbitrary code. The file is opened
    with the platform default encoding.

    Returns:
        A list of ``(idiom, syllables)`` tuples in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    entries = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines
                entries.append(ast.literal_eval(line))
    return entries


def _syllables(entry):
    """Return the non-empty pinyin syllables of an ``(idiom, pinyin)`` entry."""
    return [s for s in entry[1] if s]


def next_idiom(current, words):
    """Pick an idiom that can follow *current* in the chain.

    Args:
        current: the idiom just played.
        words: list of ``(idiom, syllables)`` tuples.

    Returns:
        A randomly chosen idiom whose first syllable equals the last syllable
        of *current*. Idioms whose first character is identical to the last
        character of *current* are preferred over mere homophones. Returns
        ``''`` when *current* is not in *words* or nothing can follow it.
    """
    if not current:
        return ''
    # Only the first matching entry is used.
    entry = next((w for w in words if w[0] == current), None)
    if entry is None:
        return ''
    tail = _syllables(entry)
    if not tail:
        return ''
    last_syllable = tail[-1]

    # Homophone candidates: first syllable equals our last syllable.
    candidates = [
        w[0] for w in words
        if w[0] and _syllables(w)[:1] == [last_syllable]
    ]
    if not candidates:
        return ''

    # Prefer candidates that start with the same character the input ends
    # with. The candidate's FIRST CHARACTER is compared, not the whole idiom.
    same_char = [c for c in candidates if c[0] == current[-1]]
    # Random choice adds a bit of variety between runs.
    return random.choice(same_char or candidates)


def main():
    """Run the interactive loop; entering ``q`` quits."""
    words = load_words()
    count = 0
    input_word = ''
    while True:
        # Ask the user when there is nothing to continue from, or after
        # MAX_AUTO_TURNS automatic answers in a row.
        if input_word == '' or count == MAX_AUTO_TURNS:
            input_word = input('请输入一个成语:\n').strip()
            count = 0
        if input_word == 'q':
            break
        answer = next_idiom(input_word, words)
        if answer:
            count += 1
            print(answer + '\n')
        # The answer becomes the next input; '' triggers a new prompt.
        input_word = answer


if __name__ == '__main__':
    main()
结果如下: