본문 바로가기

JS/NodeJS

취업 사이트 크롤링 Crawling

여러 취업 사이트의 채용 공고를 한곳에 모아 볼 수 있도록, cheerio 모듈로 공고를 크롤링해서 (아직 조회용 웹 페이지는 없이) DB에 저장되게 했다.

https://github.com/copyNdpaste/crawler_IT_recruitment_info-Node-JS


generateUrls가 실행되면서 urls 배열에 페이지 번호가 붙은 url들을 넣는다.

wizard가 실행되면서 urls에 들어있던 url이 앞에서부터 하나씩 나온다. shift()함수 사용.

url을 하나씩 scraper.js 파일을 모듈화한 Scraper 객체에 전달한다. Scraper는 전달받은 url과 함께 초기화를 시작한다.

loadWebPage()에서는 전달된 url에서 데이터를 꺼내 body 변수에 붙인다. 'loaded'를 emit하면서 콜백함수에 결과 값으로 body를 전달한다. on에서 parsePage를 실행한다. cheerio를 써서 전달받은 데이터들을 $변수에 넣는다. 여기서 각 url의 구조를 보고 태그, 클래스, 아이디 등을 가지고 원하는 데이터를 추출한다. 데이터 추출이 끝나면 model 객체에 값을 전달하고 'complete'를 emit한다. index.js에서 'complete'를 on하면 Model 객체에 저장할 값을 넣고 .save로 저장한다. 다음 url에 대한 데이터를 parsing, save하기 위해 wizard()를 호출한다.


index.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
var Model=require('./model');
var Scraper=require('./scraper');
var Pages=[];
 
/**
 * Build the list of listing-page urls to crawl.
 *
 * @param {number} limit - last page number to include; pages 1..limit are generated per site.
 * @returns {string[]} all jobkorea page urls first, followed by all saramin page urls.
 */
function generateUrls(limit){
    var JOBKOREA_PREFIX="http://www.jobkorea.co.kr/Starter/?JoinPossible_Stat=0&schPart=%2C%2C10016%2C%2C&schOrderBy=0&LinkGubun=0&LinkNo=0&schType=0&schGid=0&Page=";
    var SARAMIN_PREFIX="http://www.saramin.co.kr/zf_user/jobs/list/job-category?page=";
    // saramin job category: IT (the page number goes between prefix and suffix)
    var SARAMIN_SUFFIX='&cat_key=40413%2C40421%2C40407%2C40430%2C40420%2C40721%2C40751%2C40745%2C41604%2C41611&exp_cd=1&search_optional_item=y&isAjaxRequest=0&page_count=50&sort=RL&type=job-category#searchTitle';
    // incruit is currently disabled:
    // http://job.incruit.com/jobdb_list/searchjob.asp?ct=1&ty=1&cd=150&crr=1&jobty=1,2,4&page=

    var jobkoreaPages=[];
    var saraminPages=[];
    var page=1;

    while(page<=limit){
        jobkoreaPages.push(JOBKOREA_PREFIX+page);
        saraminPages.push(SARAMIN_PREFIX+page+SARAMIN_SUFFIX);
        page+=1;
    }

    return jobkoreaPages.concat(saraminPages);
}
//store all urls in a global variable; wizard() consumes this queue from the front
Pages=generateUrls(9); //pages 1-9 of each site (an earlier note said "1~10p", but limit 9 yields 9 pages)
 
/**
 * Crawl the next queued url and, once it has produced a result, move on to
 * the following one.
 *
 * Takes one url off the front of the global Pages queue, starts a Scraper for
 * it and saves every listing the scraper reports through Model. Logs 'done'
 * and returns when the queue is empty.
 */
function wizard(){
    //if the pages array is empty, done
    if(!Pages.length){
        console.log('done');
        return;
    }
    var url=Pages.shift();
    console.log(url);

    var scraper=new Scraper(url);//create a Scraper for this url; it starts loading immediately
    console.log('Requests Left: '+Pages.length);//number of urls still queued

    // The scraper emits 'complete' once per listing, so a single page fires it
    // many times. Advance the queue only once per url; otherwise every listing
    // would start another request chain.
    var advanced=false;
    function next(){
        if(advanced){
            return;
        }
        advanced=true;
        wizard();
    }

    //keep crawling the remaining urls even if this one failed
    scraper.on('error',function(err){
        console.log(err);
        next();
    });
    //save each listing
    scraper.on('complete',function(listing){
        console.log('model save');
        var model=new Model(listing);
        model.save(function(err){
            if(err){
                console.log('DB err:'+url);
                // include the underlying error so the failure can be diagnosed
                console.log(err);
            }
        });
        next();
    });
}
 
// Kick off the crawl by calling wizard() this many times up front; each call
// either takes one url off the queue or logs 'done' when none are left.
var numberOfParallelRequests = 200;
var started = 0;
while (started < numberOfParallelRequests) {
  wizard();
  started += 1;
}
cs


model.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
var mongoose=require('mongoose');
 
// Connect mongoose to the local 'scraper' database.
mongoose.connect('mongodb://localhost:27017/scraper',{useNewUrlParser:true});
// Report connection errors together with the underlying error object so the
// actual cause is visible, not just the generic hint.
mongoose.connection.on('error',function(err){
    console.error('MongoDB Connection Error. Make sure MongoDB is running');
    console.error(err);
});
 
// Fields stored for one scraped job listing.
var listingFields={
    companyname:String,
    title:String,
    link:String,
    field:Array,     // list of job-sector texts collected by the scraper
    career:String,
    levOfEdu:String,
    area:String,
    deadline:String,
    from:String      // source site tag set by the scraper ('jobkorea' or 'saramin')
};

var ListingsSchema=new mongoose.Schema(listingFields);
var Listings=mongoose.model('Listings',ListingsSchema);

// The module's export is the 'Listings' model itself.
module.exports=Listings;
cs


scraper.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
var http=require('http');
var cheerio=require('cheerio');
var util=require('util');
var EventEmitter=require('events').EventEmitter;
var STATUS_CODES=http.STATUS_CODES;
 
var Iconv = require('iconv').Iconv;
var request = require('request');
// EUC-KR -> UTF-8 converter. The two encodings are separate arguments; the
// comma between them was missing, which made the whole file a SyntaxError.
// NOTE(review): nothing in the visible scraper code uses this converter yet.
var iconv = new Iconv('EUC-KR','utf-8//translit//ignore');
 
//Scraper Constructor
/**
 * Scraper Constructor.
 *
 * Creates an event emitter bound to one url and starts scraping right away by
 * calling init().
 *
 * @param {string} url - address of the listing page to download and parse.
 */
function Scraper(url){
    // Run the parent constructor so the emitter state is set up on this
    // instance, as expected for constructors wired up with util.inherits.
    EventEmitter.call(this);
    this.url=url;
    this.init();
}

// Scraper instances inherit on()/emit() from EventEmitter
util.inherits(Scraper,EventEmitter);
 
//Initialize Scraping
//Initialize Scraping: register the parser for the 'loaded' event, then start the download
Scraper.prototype.init=function(){
    var scraper=this;
    scraper.on('loaded',function onLoaded(html){
        scraper.parsePage(html);//parse the downloaded page
    });
    scraper.loadWebPage();
};
 
/**
 * Download this.url with an HTTP GET and report the outcome as an event.
 *
 * Emits 'loaded' with the complete response body (a string) when the server
 * answers 200. Emits 'error' with the status text for any other status code,
 * or with the request error when the request itself fails.
 */
Scraper.prototype.loadWebPage=function(){
    var self=this;

    http.get(self.url, function(res){
        if(res.statusCode!==200){
            // The body is not wanted, but it must still be consumed so the
            // underlying socket is released.
            res.resume();
            return self.emit('error',STATUS_CODES[res.statusCode]);
        }
        var body='';
        // Decode inside the stream: appending raw Buffer chunks to a string
        // corrupts multi-byte characters that straddle a chunk boundary.
        res.setEncoding('utf8');
        res.on('data',function(chunk){
            body+=chunk;
        });
        res.on('end',function(){
            self.emit('loaded',body);
        });
    })
    .on('error',function(err){
        self.emit('error',err);
    });
};
 
//Parse html and return an object
//Parse html and emit one 'complete' event per job listing found.
//The site is recognised by the exact class attribute of <body>; a page that
//matches neither known site emits nothing.
Scraper.prototype.parsePage=function(html){
    var self=this;
    var $=cheerio.load(html);
    var bodyClass=$('body').attr('class');

    //Text of every element matching `selector` below `row`, in document order
    function textsOf(row,selector){
        var texts=[];
        row.find(selector).each(function(_,el){
            texts.push($(el).text());
        });
        return texts;
    }

    if(bodyClass==='modalOpenBd best1000'){//jobkorea
        $('.filterList li').each(function(_,li){
            var row=$(li);
            var titleAnchor=row.find('.info .tit a');
            var conditions=row.find('.sDesc span');
            var link='http://www.jobkorea.co.kr'+titleAnchor.attr('href');
            console.log(link);

            self.emit('complete',{
                companyname:row.find('.co .coTit a').text(),
                title:titleAnchor.attr('title'),
                link:link,
                field:textsOf(row,'.info .sTit span'),
                career:row.find('.sDesc strong').text(),
                levOfEdu:conditions.eq(0).text(),
                area:conditions.eq(1).text(),
                deadline:row.find('.side .day').text(),
                from:'jobkorea'
            });
        });
        return;
    }

    if(bodyClass==='has_lnb'){//saramin
        $('.recruiting_list tbody tr').each(function(_,tr){
            var row=$(tr);
            var link='http://www.saramin.co.kr'+row.find('.notification_info .job_tit a').attr('href');
            console.log(link);

            self.emit('complete',{
                companyname:row.find('.company_nm .str_tit span').text(),
                title:row.find('.notification_info .job_tit .str_tit span').text(),
                link:link,
                field:textsOf(row,'.notification_info .job_sector span'),
                career:row.find('.recruit_condition .career').text(),
                levOfEdu:row.find('.recruit_condition .education').text(),
                area:row.find('.company_info .work_place').text(),
                deadline:row.find('.support_info .deadlines').text(),
                from:'saramin'
            });
        });
    }

    //incruit parsing is disabled. The unfinished draft iterated over
    //'.list_full_default table tbody tr' and read the company name from the
    //title attribute of 'th .companys .check_list_r .links a'.
};
module.exports=Scraper;
cs



크롤링한 데이터 조회하는 사이트 만들기 -> http://oneshottenkill.tistory.com/312