From aeba36f7f9e92fb6d4a4ce3da6b86eec26d05f8b Mon Sep 17 00:00:00 2001
From: dchem <jaey226@gmail.com>
Date: Mon, 10 Mar 2014 17:03:42 +0000
Subject: [PATCH] Init commit

---
 actorsByStates.js |  129 +++++++++++++++++++++++++++++++++++++++++++
 README.md         |   14 ++++
 2 files changed, 142 insertions(+), 1 deletions(-)

diff --git a/README.md b/README.md
index 41c9367..a2ddf8c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,16 @@
 actors-by-states
 ================
 
-Fun project to scrape Wikipedia to get birthplace of each US born actors
+Fun Node.js project that scrapes Wikipedia to get the birthplace of each US-born actor
+
+You need request, cheerio and yagnus to run this:
+```
+npm install request cheerio
+npm install yagnus
+```
+Then run:
+```
+node actorsByStates.js
+```
+
+Copyright (c) 2014 Jae Yang. See LICENSE file for license rights and limitations (MIT).
diff --git a/actorsByStates.js b/actorsByStates.js
new file mode 100644
index 0000000..45f0022
--- /dev/null
+++ b/actorsByStates.js
@@ -0,0 +1,129 @@
+var EventEmitter = require('events').EventEmitter;
+var ehandler = new EventEmitter(); // event bus that chains the scraping steps
+var request = require('request'),
+  yagnus = require('yagnus'), // NOTE(review): presumably defines the global `org` used below -- confirm
+  cheerio = require('cheerio'),
+  fs = require('fs'),
+  urls = [], // queue of actor page URLs taken from the current search-result page
+  birthStates = [], // declared but not referenced in the code shown here
+  usStates = []; // state/territory names scraped from Wikipedia (was an implicit global)
+var offset = 0; // search-result offset of the page being processed
+var limit = 1000; // crawling stops once offset reaches this value
+var pageLimit = 50; // number of search results requested per page
+var dstat = org.yagnus.stats.initDiscreteSummaryStatistics(1); // counter used to tally actors per state
+
+
+// Scrape the Wikipedia list of US states into usStates, then emit 'loadedStates'.
+var requestStates = function () {
+  request('http://en.wikipedia.org/wiki/List_of_US_States', function (err, resp, body) {
+    if (err || resp.statusCode !== 200) { // report the failure instead of ending silently
+      return console.error('Could not load the list of US states: ' + (err || 'HTTP ' + resp.statusCode));
+    }
+    var $ = cheerio.load(body);
+    $('th').each(function (i, elem) {
+      var title = $(this).find('a').attr('title');
+      // push() keeps usStates dense, so .length below is the real count (usStates[i] left holes).
+      if ($(this).find('span').hasClass('flagicon') && title) usStates.push(title);
+    });
+    console.log("Number of US States / Territories: " + usStates.length);
+    ehandler.emit('loadedStates');
+  });
+};
+
+
+// Advance the crawl by one step: pass the next queued actor URL to the 'popped'
+// listener or, once the queue is empty, either request the next search page
+// ('setComplete') or print the final per-state counts.
+var popActor = function () {
+  if (urls.length > 0) {
+    // The popped entry is forwarded as-is, even when it is undefined.
+    ehandler.emit('popped', urls.pop());
+    return;
+  }
+  // Queue drained: the current search-result page is finished.
+  offset += pageLimit;
+  console.log('end: '+ offset);
+  if (offset < limit) {
+    ehandler.emit('setComplete');
+    return;
+  }
+  // offset >= limit: every page has been processed, so report the count per state.
+  dstat.calc();
+  usStates.forEach(function (stateName) {
+    console.log(stateName + ': '+ dstat.getCount(stateName) );
+  });
+};
+
+// Fetch one actor page, count the US state named in each of its "birthplace" spans,
+// and always emit 'birthplaceLoaded' so the crawl moves on to the next actor.
+//   name - URL of the actor page; an undefined/null entry is skipped.
+var requestBirthPlace = function (name) {
+  if (name == undefined) {
+    // Nothing to fetch: just let the crawl advance.
+    ehandler.emit('birthplaceLoaded');
+    return;
+  }
+  request(name, function (errA, respA, bodyA) {
+    if (errA || respA.statusCode !== 200) {
+      // A failed page used to emit nothing, which ended the whole run silently; skip it instead.
+      console.error('Skipping ' + name + ': ' + (errA || 'HTTP ' + respA.statusCode));
+      ehandler.emit('birthplaceLoaded');
+      return;
+    }
+    var $$ = cheerio.load(bodyA);
+    $$('span.birthplace').each(function (i, elem) {
+      // Prefer the title of the first link in the span; fall back to the span's text.
+      var bstate = $$(this).find('a').attr('title') || $$(this).text();
+      // `return false` never stopped the old forEach, so "West Virginia" was also counted as
+      // "Virginia"; count only the longest state name contained in the birthplace.
+      var match = '';
+      usStates.forEach(function (ele) {
+        if (ele && ele.length > match.length && bstate.indexOf(ele) !== -1) {
+          match = ele;
+        }
+      });
+      if (match) dstat.inc(match);
+    });
+    ehandler.emit('birthplaceLoaded');
+  });
+};
+
+// Queue the article URLs of one search-result page in urls, then emit 'loadedActors'.
+var requestActors = function () {
+  var searchUrl = 'http://en.wikipedia.org/w/index.php?title=Special:Search&limit=' + pageLimit +
+    '&offset=' + offset + '&redirs=0&profile=default&search=%22American+actor%22';
+  request(searchUrl, function (err, resp, body) {
+    if (err || resp.statusCode !== 200) {
+      // A failed page used to emit nothing, ending the run without results; skip the page instead.
+      console.error('Search page at offset ' + offset + ' failed: ' + (err || 'HTTP ' + resp.statusCode));
+    } else {
+      var $ = cheerio.load(body);
+      $('li').each(function (i, elem) {
+        var href = $(this).find('div.mw-search-result-heading').find('a').attr('href');
+        if (href) urls.push('http://en.wikipedia.org' + href); // dense queue: no holes, no link-less results
+      });
+      console.log(urls.length);
+    }
+    ehandler.emit('loadedActors');
+  });
+};
+
+// Event wiring: states -> search page -> one actor at a time -> next search page.
+ehandler.on('loadedStates', requestActors);
+ehandler.on('setComplete', requestActors);
+ehandler.on('loadedActors', popActor);
+ehandler.on('popped', requestBirthPlace);
+ehandler.on('birthplaceLoaded', popActor);
+
+// On SIGINT, print the counts gathered so far before exiting.
+process.on('SIGINT', function () {
+  console.log('Interrupted');
+  dstat.calc();
+  var printCount = function (stateName) {
+    console.log(stateName + ': '+ dstat.getCount(stateName) );
+  };
+  usStates.forEach(printCount);
+  process.exit();
+});
+// Start the pipeline by loading the list of states.
+requestStates();

--
Gitblit v1.8.0