时间:2022-08-06 22:12:01 | 来源:网站运营
时间:2022-08-06 22:12:01 来源:网站运营
最后执行下面命令:

```shell
go mod tidy
go mod vendor
go build
```
编译结束后,配置config。重命名config.dist.json为config.json,打开config.json,修改mysql部分的配置,填写为你的mysql地址、用户名、密码、数据库信息,新建cobweb数据库,导入mysql.sql到填写的数据库中,然后双击运行可执行文件即可开始采集之旅。

```json
{
  "mysql": { //数据库配置
    "Database": "spider",
    "User": "root",
    "Password": "root",
    "Charset": "utf8mb4",
    "Host": "127.0.0.1",
    "TablePrefix": "",
    "Port": 3306,
    "MaxIdleConnections": 1000,
    "MaxOpenConnections": 100000
  }
}
```
var MaxChan = 100var waitGroup sync.WaitGroupvar ch = make(chan string, MaxChan)func SingleSpider(){ var websites []Website var counter int DB.Model(&Website{}).Where("`status` = 0").Limit(MaxChan*10).Count(&counter).Find(&websites) if counter > 0 { for _, v := range websites { ch <- v.Domain waitGroup.Add(1) go SingleData2(v) } } else { log.Println("等待数据中,10秒后重试") time.Sleep(10 * time.Second) } SingleSpider()}
//锁定当前数据 DB.Model(&website).Where("`id` = ?", website.ID).Update("status", 2) log.Println(fmt.Sprintf("开始采集:%s://%s", website.Scheme, website.Domain)) err := website.GetWebsite() if err == nil { website.Status = 1 } else { website.Status = 3 } log.Println(fmt.Sprintf("入库2:%d:%s",website.ID, website.Domain)) DB.Save(&website)
// Insert the website (domain, scheme, title) only when no row with the same
// domain exists yet: the values are selected from dual under a NOT EXISTS
// guard, and every value is passed as a bound parameter.
DB.Exec("insert into website(`domain`, `scheme`,`title`) select ?,?,? from dual where not exists(select id from website where `domain` = ?)", v.Domain, v.Scheme, v.Title, v.Domain)
contentType := strings.ToLower(resp.Header.Get("Content-Type")) log.Println(contentType) var htmlEncode string if contentType == "" { //先尝试读取charset reg := regexp.MustCompile(`(?is)charset=["']?/s*([a-z0-9/-]+)`) match := reg.FindStringSubmatch(body) if len(match) > 1 { htmlEncode = strings.ToLower(match[1]) if htmlEncode != "utf-8" && htmlEncode != "utf8" { body = ConvertToString(body, "gbk", "utf-8") } } else { reg = regexp.MustCompile(`(?is)<title[^>]*>(.*?)<//title>`) match = reg.FindStringSubmatch(body) if len(match) > 1 { aa := match[1] _, htmlEncode, _ = charset.DetermineEncoding([]byte(aa), "") if htmlEncode != "utf-8" { body = ConvertToString(body, "gbk", "utf-8") } } } } else if !strings.Contains(contentType, "utf-8") { body = ConvertToString(body, "gbk", "utf-8") }
//尝试获取微信 reg := regexp.MustCompile(`(?i)(微信|微信客服|微信号|微信咨询|微信服务)/s*(:|:|/s)/s*([a-z0-9/-_]{4,30})`) match := reg.FindStringSubmatch(contentText) if len(match) > 1 { website.WeChat = match[3] } //尝试获取QQ reg = regexp.MustCompile(`(?i)(QQ|QQ客服|QQ号|QQ号码|QQ咨询|QQ联系|QQ交谈)/s*(:|:|/s)/s*([0-9]{5,12})`) match = reg.FindStringSubmatch(contentText) if len(match) > 1 { website.QQ = match[3] } //尝试获取电话 reg = regexp.MustCompile(`([0148][1-9][0-9][0-9/-]{4,15})`) match = reg.FindStringSubmatch(contentText) if len(match) > 1 { website.Cellphone = match[1] }
关键词:信息,访问,获取,把手