// 第三步:采集分页列表数据并入库。
// 共采集了 5000 多条数据,速度很快。
package main
import (
"fmt"
"database/sql"
_ "github.com/go-sql-driver/mysql"
"strings"
"log"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"os"
"strconv"
)
// topicinfo is the package-level working record for one scraped entry.
// It starts out as five empty strings.
var topicinfo = make([]string, 5)
// DbWorker carries the database connection settings.
type DbWorker struct {
	Dsn string // MySQL data source name
}
// main crawls the paginated company listings of every city and stores them.
//
// Steps:
//  1. Open the MySQL database and ping it so a bad DSN or unreachable server
//     is reported before any crawling starts.
//  2. Read data/city_site.txt. Every line is split on single spaces; field
//     index 1 becomes the map key and field index 2 the site base URL, to
//     which "/company/" is appended. Lines with fewer than three fields
//     (for example the empty line after a trailing newline) are skipped.
//  3. For every city request "<base>/company/?p=N" for N = 1, 2, ... until
//     the last ".page a" link is no longer "下一页". Each page is handed to
//     getpageinfo and its URL is appended to data/city_company_with_page.txt.
func main() {
	// Connect to MySQL.
	dbw := DbWorker{
		Dsn: "root:xiaohai123@tcp(localhost:3306)/test?charset=utf8",
	}
	db, err := sql.Open("mysql", dbw.Dsn)
	if err != nil {
		panic(err)
	}
	// Deferred only after the error check: db is nil when sql.Open fails.
	defer db.Close()
	// sql.Open does not establish a connection; Ping does, so the success
	// message below is only printed once the server really answered.
	if err := db.Ping(); err != nil {
		panic(err)
	}
	fmt.Println("数据库链接成功!")

	dataPath := "data/city_site.txt"
	content, err := ioutil.ReadFile(dataPath)
	if err != nil {
		panic(err)
	}

	// Build the city -> company-list URL mapping.
	cityUrls := map[string]string{}
	for _, city := range strings.Split(string(content), "\n") {
		// Drop the "\r" left behind by CRLF line endings so it cannot end
		// up inside the URL.
		fields := strings.Split(strings.TrimRight(city, "\r"), " ")
		if len(fields) < 3 {
			continue
		}
		cityUrls[fields[1]] = fields[2] + "/company/"
	}

	fd, err := os.OpenFile("data/city_company_with_page.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		panic(err)
	}
	defer fd.Close()

	// record scrapes one page into the database and logs its URL to fd.
	record := func(pageURL string) {
		getpageinfo(db, pageURL)
		if _, err := fd.WriteString(pageURL + "\n"); err != nil {
			log.Printf("write %s: %v", fd.Name(), err)
		}
	}

	for _, cityUrl := range cityUrls {
		pageurl := cityUrl + "?p="
		for index := 1; ; index++ {
			pagelasturl := pageurl + strconv.Itoa(index)
			doc, err := goquery.NewDocument(pagelasturl)
			if err != nil {
				// Without a document the pager cannot be inspected, so the
				// remaining pages of this city are skipped.
				log.Printf("fetch %s: %v", pagelasturl, err)
				break
			}
			// The pager's last link reads "下一页" (next page) on every page
			// except the final one.
			if doc.Find(".page a").Last().Text() != "下一页" {
				fmt.Println("采集最后一页" + pagelasturl)
				record(pagelasturl)
				break
			}
			// The first page is recorded under the bare list URL, later
			// pages under their "?p=N" URL.
			target := pagelasturl
			if index == 1 {
				target = cityUrl
			}
			fmt.Println("采集到本地" + target)
			record(target)
		}
	}
}
// getpageinfo downloads one listing page and stores every company link on it.
//
// For each ".gongslist li" item it reads the href of the first
// ".company-link" element, pairs it with pagefullurl in the shared topicinfo
// record, sanitizes the record with splitstring, prints it and inserts it
// through dbmanager. Items that have no such href are skipped.
//
// A page that cannot be fetched ends the program via log.Fatal.
func getpageinfo(db *sql.DB, pagefullurl string) {
	doc, err := goquery.NewDocument(pagefullurl)
	if err != nil {
		log.Fatal(err)
	}
	doc.Find(".gongslist li").Each(func(_ int, s *goquery.Selection) {
		href, ok := s.Find(".company-link").Eq(0).Attr("href")
		if !ok {
			// No company link in this item: inserting it would only add a
			// row with an empty source_url.
			return
		}
		topicinfo[0] = href
		topicinfo[1] = pagefullurl
		// Strip characters that are awkward for the database layer.
		topicinfo = splitstring(topicinfo)
		fmt.Println(topicinfo)
		// Persist the record.
		dbmanager(db, topicinfo)
	})
}
// splitstring replaces quote characters, commas and question marks in the
// first two entries of pageinfo with a single space each.
//
// Replaced sequences: "'''" (collapsed to one space), "'", "’", "‘", "“",
// "”", "," and "?". Entries from index 2 onward are left untouched.
//
// The slice is modified in place and the same slice is returned. Slices with
// fewer than two entries are handled without panicking; only the entries that
// exist are processed.
func splitstring(pageinfo []string) []string {
	// Only the first two slots are sanitized; clamp for short input.
	limit := 2
	if len(pageinfo) < limit {
		limit = len(pageinfo)
	}
	// NewReplacer tries the patterns in argument order at each position, so
	// "'''" wins over "'" exactly as it did when the replacements were
	// applied one after another.
	cleaner := strings.NewReplacer(
		"'''", " ",
		"'", " ",
		"’", " ",
		"‘", " ",
		"“", " ",
		"”", " ",
		",", " ",
		"?", " ",
	)
	for i := 0; i < limit; i++ {
		pageinfo[i] = cleaner.Replace(pageinfo[i])
	}
	return pageinfo
}
// dbmanager inserts one scraped record into the qizhuang_company table.
//
// info[0] is stored as source_url and info[1] as at_list_url. The values are
// passed as query parameters rather than spliced into the SQL text, so
// quotes in scraped data cannot alter the statement.
//
// Failures are reported on standard output and the function returns without
// inserting; it never panics on a short info slice.
func dbmanager(db *sql.DB, info []string) {
	if len(info) < 2 {
		fmt.Printf("insert data error: need 2 fields, got %d\n", len(info))
		return
	}
	const insertSQL = "INSERT INTO qizhuang_company (source_url,at_list_url) VALUES(?,?)"
	// DB.Exec releases its resources itself, so no statement is left open
	// per inserted row.
	if _, err := db.Exec(insertSQL, info[0], info[1]); err != nil {
		fmt.Printf("insert data error: %v\n", err)
	}
}
为 TA 点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦