搜索文档树

    科技2022-09-04  152

    # @ Time : 2020/10/5 19:08
    # @ Author : Ellen
"""Beautiful Soup walkthrough: searching the document tree with find_all().

Parses a small job-listing table and prints the first text fragment (the
job title) of every data row.  The commented-out snippets are alternative
queries kept for reference; uncomment one section at a time to try it.
"""
from bs4 import BeautifulSoup

# Fixture: one header row followed by five job rows.
html_doc = """
<table class="tablelist" cellspacing="0" cellpadding="0">
    <tbody>
        <tr class="h">
            <td class="1" width="374">职位名称</td>
            <td>职位类别</td>
            <td>⼈数</td>
            <td>地点</td>
            <td>发布时间</td>
        </tr>
        <tr class="even">
            <td class="l"><a href="https://www.baidu.com">区块链⾼级研发⼯</a></td>
            <td class="l">技术类</td>
            <td class="l">1</td>
            <td class="l">深圳</td>
            <td class="l">2018-11-25</td>
        </tr>
        <tr class="even">
            <td><a href="https://www.qq.com">⾦融云⾼级后台开发</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2018-11-24</td>
        </tr>
        <tr>
            <td><a href="https://www.juran.com">⾼级研发⼯程师</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2018-11-24</td>
        </tr>
        <tr>
            <td><a href="https://www.python.com">⾼级图像算法⼯程师</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2018-11-24</td>
        </tr>
        <tr>
            <td><a href="https://www.lg.com" id="test" class="test">⾼级业务运维工程师</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2018-11-24</td>
        </tr>
    </tbody>
</table>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 1. Get every <tr> tag.
# trs = soup.find_all('tr')  # find_all returns a list
# print(trs)
# for tr in trs:
#     print(tr)
#     print("=" * 30)

# 2. Get the second <tr> tag.
# tr = soup.find_all('tr', limit=2)[1]
# print(tr)

# 3. Get every <tr> tag whose class is "even" (two equivalent spellings).
# trs = soup.find_all('tr', class_='even')
# trs = soup.find_all('tr', attrs={"class": 'even'})
# for tr in trs:
#     print(tr)
#     print("=" * 30)

# 4. Extract the <a> tags whose id is "test" and whose class is also "test".
# alist = soup.find_all('a', attrs={"id": "test", "class": "test"})
# alist = soup.find_all('a', id="test", class_="test")
# print(alist)

# 5. Get the href attribute of every <a> tag.
# hrefs = soup.find_all("a")
# for a in hrefs:
#     print(a)
#     # 5.1 By subscript.
#     href = a['href']
#     # .get() returns None when the attribute is missing; the other two
#     # access styles raise instead.
#     href = a.get('href')
#     print(href)
#     # 5.2 Through the attrs mapping.
#     href = a.attrs['href']
#     print(href)

# 6. Get all job information as plain text.
trs = soup.find_all('tr')[1:]  # [1:] skips the header row
for tr in trs:
    # print(tr)
    # tds = tr.find_all('td')
    # print(tds)
    # title = tds[0]
    # print(title.string)

    # All text nodes of the row, whitespace-only nodes included:
    # infos = list(tr.strings)
    # print(infos)

    # stripped_strings trims each text node and drops the blank ones.
    infos = list(tr.stripped_strings)
    if infos:  # a row without any text would make infos[0] raise IndexError
        print(infos[0])
    Processed: 0.008, SQL: 9