-- Raw HTTP traffic capture table: one row per observed request/response pair.
-- utf32 charset is deliberate here to avoid "Incorrect string value" errors on
-- multi-byte payloads (utf8mb4 would be the more common modern choice).
CREATE TABLE IF NOT EXISTS rawhttplog
(
    -- surrogate key, auto-assigned
    id int(10) unsigned NOT NULL AUTO_INCREMENT,
    -- capture timestamp; refreshed automatically on UPDATE
    tscreated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    -- HTTP response status code (e.g. 200, 404)
    status int(10) unsigned DEFAULT NULL,
    url mediumtext,
    requestHeadersText mediumtext,
    responseHeadersText mediumtext,
    responseBody mediumtext,
    -- normalized (lowercased) Content-Type of the response
    responseContentType varchar(1024) DEFAULT NULL,
    -- 1 when responseBody is base64-encoded, 0 otherwise
    responseBodyBase64Encoded int(10) DEFAULT NULL,
    UNIQUE KEY id_2 (id),
    -- NOTE(review): this secondary index duplicates the unique key above;
    -- kept for fidelity with the original schema, but likely removable.
    KEY id (id) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf32;
NOTE: make sure you have the correct charset, or you might get: Incorrect string value: '\xD8\xA7\xD9\x8...'
Insert the following snippet to route the collected data into the datastore defined above:
// jsonio endpoint that routes one captured HTTP transaction (queryObject)
// into the rawhttplog table defined above, then reports the outcome via cb.
var modtask = function() {}
modtask.apiInterfaceType = 'jsonio';
modtask.processQueries = function(queryObject, cb) {
  var db = 'izycloudautoscale';
  modtask.doChain([
    ['nop'],
    function(_push) {
      // Map the incoming capture onto the rawhttplog columns.
      var obj = {
        status: queryObject.status,
        url: queryObject.url,
        requestHeadersText: queryObject.requestHeadersText,
        responseHeadersText: JSON.stringify(queryObject.responseHeaders),
        responseBody: queryObject.responseBody.body,
        responseBodyBase64Encoded: queryObject.responseBody.base64Encoded ? 1 : 0
      };
      if (queryObject.responseHeaders['Content-Type']) {
        // Lowercase so later equality filters (e.g. 'text/html; charset=utf-8') match.
        obj.responseContentType = queryObject.responseHeaders['Content-Type'].toLowerCase();
      }
      modtask.ldmod('rel:sql').processQueries([
        // The table name must be a single clean token (no stray whitespace).
        modtask.ldmod('sql/q').getInsert(db + '.rawhttplog', [obj])
      ], function(outcome) {
        console.log(obj.url, outcome);
        cb(outcome);
      });
    }
  ]);
}
You can post to the traffic logger using the following snippet, e.g.:
// POST jsonData to the traffic-logger endpoint defined above.
// endpoint is optional and defaults to a placeholder -- replace it with
// (or pass in) your collector's actual URL.
function IzyRecord(jsonData, endpoint) {
  var url = endpoint || 'http://YOUR_TRAFFIC_LOGGER_HOST/';
  var xhr = new XMLHttpRequest();
  // third argument true => asynchronous request
  xhr.open("POST", url, true);
  // Send the proper header information along with the request
  // xhr.setRequestHeader("Content-type", "application/json");
  xhr.onreadystatechange = function() {
    if (xhr.readyState == XMLHttpRequest.DONE && xhr.status == 200) {
      // JSON.parse(xhr.responseText) would give you the data,
      // but we are not interested in the response here.
    }
  };
  var data = jsonData;
  xhr.send(JSON.stringify(data));
}
So, for example, if you are using the Chrome live-headers toolbar:
// Chrome debugger-API hook: when a response is received, fetch its body
// and forward the whole transaction to the traffic logger via IzyRecord.
// NOTE(review): message, debuggeeId, and params come from the enclosing
// chrome.debugger.onEvent listener (not shown here) -- confirm against
// the live-headers sample linked at the bottom of this page.
if (message == "Network.responseReceived") {
chrome.debugger.sendCommand({tabId: debuggeeId.tabId
}, "Network.getResponseBody", {"requestId": params.requestId
}, function(response) {var izyObj = {
// 'NOQUOTE' prefix: presumably instructs the SQL layer to emit
// UTC_TIMESTAMP() unquoted -- TODO confirm against the sql/q module.
tscreated: 'NOQUOTEUTC_TIMESTAMP()',status: params.response.status,
url: params.response.url,requestHeaders: params.response.requestHeaders,
responseHeaders: params.response.headers,requestHeadersText: params.response.requestHeadersText,
// response carries {body, base64Encoded} as returned by getResponseBody
responseBody: response};
IzyRecord(izyObj);});
}
Now you are collecting data. To verify that raw data is being collected, you may use a simple SQL query:
-- Show the 10 most recently captured transactions.
SELECT
    tscreated,
    url,
    responseContentType,
    responseBody
FROM rawhttplog
ORDER BY tscreated DESC
LIMIT 10
If you have enterprise analytics enabled, you can run advanced queries to find patterns within the collected data.
You can also pull the results into very large text files.
SET NAMES utf8;
-- Raise limits so the very large concatenated dump does not get truncated.
SET group_concat_max_len = 100000000000;
SET GLOBAL max_allowed_packet = 10000000000000;
-- Concatenate the selected rows into one value and write it to disk.
-- INTO DUMPFILE writes exactly one row, which is what GROUP_CONCAT yields.
SELECT GROUP_CONCAT(y SEPARATOR ' ') INTO DUMPFILE 'fullpathtodumpfile'
FROM (
    SELECT 1 AS x,
           CONCAT('-- new entry ', id, ' ---- ', url, '-------------------',
                  responseBody, '-------- id=', id, ' end ------') AS y
    FROM subscriptionmanager.rawhttplog
    WHERE responseContentType = 'text/html; charset=utf-8'
      AND url LIKE 'https://urls-to-include%'
      AND id > LASTEXTRACTEDITEM_   -- replace with the last id already extracted
    ORDER BY id, url ASC
    LIMIT 70
) AS t
GROUP BY x;
Create a node.js file that would use Tika to extract text from the payloads:
var tika = require('tika');
var options = {// Hint the content-type. This is optional but would help Tika choose a parser in some cases.
contentType: 'text/html; charset=utf-8'};
tika.text('fullpathtodumpfile', options, function(err, text) {console.log(text);
});rm fullpathtodumpfile;mysql -u USERNAME --password='PASSWORD' DBNAME < import.sql; node app.js >> data/all.txt; tail data/all.txt
The tail command should give you the LASTEXTRACTEDITEM value, which you should transfer back into the import.sql file.
Login
MapR Sandbox
Now load the data from the database into a Resilient Distributed Dataset (RDD).
// Load the database rows into an RDD and cache it in memory.
// NOTE(review): dbData and dbData.raw are defined elsewhere -- presumably
// raw maps a DB record to its raw payload; confirm against the full listing.
val data = dbData.map(dbData.raw).cache()
data.first()
In the next installment, we will talk about training the Model.
Chrome: http://developer.chrome.com/extensions/examples/api/debugger/live-headers.zip
Firefox: https