最近在研究 golang 多线程(goroutine,可指定并发数量)爬虫,并且亲手写了一个可以模拟 UA、模拟 cookie、指定线程数量的 golang 版本爬虫,以下是代码实例:
package main
import (
	"fmt"
	"io/ioutil"
	"math/rand"
	"net/http"
	"os"
	"regexp"
	"sync"
	"time"
)
// cookie holds a Cookie header value for the target site.
// NOTE(review): nothing in the visible code reads this variable; the request
// built in getUrlRespHtml only sets a User-Agent header — confirm whether the
// cookie was meant to be sent.
var cookie string ="in_userid=1; in_username=xxxxxqq.com; in_userpassword=xxxxx; in_adminid=3; in_adminname=xxxxxqq.com; in_adminpassword=xxxxx; in_permission=1%2C2%2C3%2C4%2C5%2C6"
// userAgentList is the set of User-Agent strings GetRandomUserAgent picks from.
var userAgentList = []string{"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
"Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
}
// count is the number of finished tasks; GroutinePool.Work increments it once
// per executed task and prints it.
var count int
// Task wraps a single unit of work that can be handed to a GroutinePool.
type Task struct {
	myfun func() error
}

// NewTask returns a Task that runs fun when executed.
func NewTask(fun func() error) *Task {
	return &Task{
		myfun: fun,
	}
}

// Excute runs the wrapped function once. The method has no return value, so
// a non-nil error from the function is reported on standard output instead of
// being silently discarded.
func (m *Task) Excute() {
	if err := m.myfun(); err != nil {
		fmt.Println("task err", err)
	}
}
// GroutinePool is a fixed-size worker pool.
//
// Sum is the number of worker goroutines Run starts. EntryTask is the channel
// callers submit tasks on; WorkTask is the channel the workers receive from.
type GroutinePool struct {
	Sum       int
	EntryTask chan *Task
	WorkTask  chan *Task
}

// NewGroutinePool builds a pool configured for sum workers, with both task
// channels unbuffered.
func NewGroutinePool(sum int) *GroutinePool {
	pool := new(GroutinePool)
	pool.Sum = sum
	pool.EntryTask = make(chan *Task)
	pool.WorkTask = make(chan *Task)
	return pool
}
// countMu guards count, which every worker goroutine updates.
var countMu sync.Mutex

// Work is the loop run by one worker: it receives tasks from pool.WorkTask
// until that channel is closed, executes each one, then bumps the shared
// completion counter and prints a progress line tagged with workid.
func (pool *GroutinePool) Work(workid int) {
	for task := range pool.WorkTask {
		task.Excute()

		// Increment and snapshot under the lock so concurrent workers neither
		// race on count nor print a value another worker is modifying.
		countMu.Lock()
		count++
		done := count
		countMu.Unlock()

		fmt.Println("第几个线程执行完成", workid, "执行次数", done)
	}
}
// Run starts pool.Sum worker goroutines and then forwards every task received
// on pool.EntryTask to pool.WorkTask. It blocks until EntryTask is closed.
func (pool *GroutinePool) Run() {
	for i := 0; i < pool.Sum; i++ {
		// Pass i as a call argument so each worker gets its own id; a closure
		// capturing the loop variable shares it across goroutines before Go 1.22.
		go pool.Work(i)
	}
	for task := range pool.EntryTask {
		pool.WorkTask <- task
	}
}
// GetRandomUserAgent returns one entry of userAgentList chosen at random,
// using a generator freshly seeded from the current time on every call.
func GetRandomUserAgent() string {
	seed := time.Now().UnixNano()
	rng := rand.New(rand.NewSource(seed))
	idx := rng.Intn(len(userAgentList))
	return userAgentList[idx]
}
// crawlClient is shared by all requests so connections can be reused, and it
// carries a timeout so a stalled server cannot block a worker forever.
var crawlClient = &http.Client{Timeout: 30 * time.Second}

// getUrlRespHtml fetches a page whose path contains a random 5-letter segment
// and returns the response body as a string. On any failure (building the
// request, performing it, or reading the body) it returns the literal "错误".
func getUrlRespHtml() string {
	// Re-seed the global math/rand source that generates the random path.
	rand.Seed(time.Now().UnixNano())
	url := "http://gyb.chixnaxixnge.cn/mryens/" + randomString(5) + "/"

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "错误"
	}
	req.Header.Add("User-Agent", GetRandomUserAgent())

	resp, err := crawlClient.Do(req)
	if err != nil {
		return "错误"
	}
	// Close right after the error check so the body is released even when
	// reading it fails below.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "错误"
	}
	return string(body)
}
// keywordsRe captures the content attribute of a keywords <meta> tag. It is
// compiled once instead of on every crawl.
var keywordsRe = regexp.MustCompile(`<meta name="keywords" content="(.*?)">`)

// Initialize performs one crawl: it downloads a page, extracts every keywords
// meta value, appends each one (CRLF-terminated) to baidu.txt and echoes it to
// standard output. Pages with no match are skipped without touching the file.
func Initialize() {
	page := getUrlRespHtml()
	heads := keywordsRe.FindAllStringSubmatch(page, -1)
	if len(heads) == 0 {
		return
	}

	// Open the output file for appending. O_WRONLY is required for the writes
	// to succeed on POSIX systems, and O_CREATE makes the first run work
	// without a pre-existing file.
	fo, err := os.OpenFile("baidu.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		fmt.Println("os.OpenFile err", err)
		return
	}
	// Release the descriptor; this function runs once per task, indefinitely.
	defer fo.Close()

	for _, v := range heads {
		if _, err := fo.Write([]byte(v[1] + "\r\n")); err != nil {
			fmt.Println("fo.Write err", err)
			return
		}
		fmt.Println(v[1])
	}
}
// randomInt picks a pseudo-random integer from the half-open range [min, max)
// using the global math/rand source.
func randomInt(min, max int) int {
	span := max - min
	offset := rand.Intn(span)
	return offset + min
}
// randomString returns a random string of len upper-case letters, each drawn
// uniformly from 'A' through 'Z' inclusive.
func randomString(len int) string {
	bytes := make([]byte, len)
	for i := range bytes {
		// 26 possible offsets so that 'Z' is reachable.
		bytes[i] = byte('A' + rand.Intn(26))
	}
	return string(bytes)
}
// testwork is the task body submitted to the pool: it runs one crawl through
// Initialize and always reports success.
func testwork() error {
	Initialize()
	var err error // Initialize returns nothing, so there is never an error to report.
	return err
}
func main() {
p:=NewGroutinePool(100)
go func() {
for {
p.EntryTask<-NewTask(testwork)
}
}()
p.Run()
}main函数里的
p:=NewGroutinePool(100)
代表开启100个线程(goroutine)爬取数据。
爬虫的主业务函数是
Initialize()
,其余的是线程池实现代码和随机字符串函数。
最后来一张运行golang多线程爬虫过程截图:

转载请注明来自本站(66娱乐网)
66优乐网 » golang多线程爬虫实例
66优乐网 » golang多线程爬虫实例