More analytics work

This commit is contained in:
Jocelyn Badgley (Twipped) 2021-03-05 20:23:31 -08:00
parent 1cc9c88a3a
commit 514acab9ee
10 changed files with 130 additions and 23 deletions

View File

@ -2,11 +2,11 @@
"extends": "twipped/node",
"rules": {
"node/no-unpublished-require": 0,
'indent': [ 2, 2, {
'MemberExpression': 1,
"indent": [ 2, 2, {
"MemberExpression": 1
} ],
'node/no-unsupported-features/es-syntax': [ 'error' ],
'node/no-unsupported-features/es-builtins': [ 'error' ],
'node/no-unsupported-features/node-builtins': [ 'error' ],
"node/no-unsupported-features/es-syntax": [ "error" ],
"node/no-unsupported-features/es-builtins": [ "error" ],
"node/no-unsupported-features/node-builtins": [ "error" ]
}
}

View File

@ -129,15 +129,14 @@ async function* loadFiles () {
readableObjectMode: true,
writableObjectMode: true,
transform (row, encoding, done) {
console.log(row);
// filter out OPTIONS calls
if (row['cs-method'] === 'OPTIONS') return null;
if (row['cs-method'] === 'OPTIONS') return done();
// I only care about the pixel hits, nothing else.
if (row['cs-uri-stem'] !== '/i') return null;
if (row['cs-uri-stem'] !== '/i') return done();
// this isn't an analytics event
if (row['cs-referer'] === '-') return null;
if (row['cs-referer'] === '-') return done();
row = Object.fromEntries(Object.entries(row).map(([ k, v ]) => [ k.replace(/-/g, '_'), v ]));
@ -147,7 +146,7 @@ async function* loadFiles () {
;
// we didn't get analytics data from this load, ignore it
if (!query.start) return null;
if (!query.start) return done();
const useragent = parseUA(row.cs_user_agent);

View File

@ -5,7 +5,7 @@ data "aws_region" "current" {}
resource "aws_cloudwatch_log_group" "ipixel_results" {
name = "/aws/ipixel/${var.site}"
retention_in_days = 30
retention_in_days = 90
tags = {
Site = var.site,

View File

@ -1,3 +1,12 @@
{
"extends": "airbnb-base"
"extends": "twipped/node",
"rules": {
"node/no-unpublished-require": 0,
"indent": [ 2, 2, {
"MemberExpression": 1
} ],
"node/no-unsupported-features/es-syntax": [ "error" ],
"node/no-unsupported-features/es-builtins": [ "error" ],
"node/no-unsupported-features/node-builtins": [ "error" ]
}
}

View File

@ -0,0 +1,2 @@
// Entry-point shim: re-exports everything from ./src/index so the module can be
// loaded from this directory without reaching into src/.
module.exports = exports = require('./src/index');

View File

@ -305,6 +305,11 @@
"which": "^2.0.1"
}
},
"date-fns": {
"version": "2.18.0",
"resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.18.0.tgz",
"integrity": "sha512-NYyAg4wRmGVU4miKq5ivRACOODdZRY3q5WLmOJSq8djyzftYphU7dTHLcEtLqEvfqMKQ0jVv91P4BAwIjsXIcw=="
},
"debug": {
"version": "4.3.1",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz",
@ -1547,6 +1552,11 @@
"integrity": "sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==",
"dev": true
},
"ua-parser-js": {
"version": "0.7.24",
"resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.24.tgz",
"integrity": "sha512-yo+miGzQx5gakzVK3QFfN0/L9uVhosXBBO7qmnk7c2iw1IhL212wfA3zbnI54B0obGwC/5NWub/iT9sReMx+Fw=="
},
"uri-js": {
"version": "4.4.1",
"resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",

View File

@ -2,7 +2,9 @@
"name": "cloudfront-logs",
"version": "0.0.1",
"dependencies": {
"aws-sdk": "*"
"aws-sdk": "*",
"date-fns": "~2.18.0",
"ua-parser-js": "~0.7.24"
},
"devDependencies": {
"eslint": "*",
@ -11,5 +13,8 @@
},
"scripts": {
"lint": "eslint ."
},
"engines": {
"node": ">=12.14.0"
}
}

View File

@ -2,9 +2,20 @@ const { gunzip } = require('zlib');
const { promisify } = require('util');
const { S3 } = require('aws-sdk');
const { unescape } = require('querystring');
const parseUA = require('ua-parser-js');
const format = require('date-fns/format');
const { URL } = require('url');
const gunzipAsync = promisify(gunzip);
// Safely breaks a URL string into a plain object holding its components
// (hash, host, hostname, href, origin, password, pathname, port, protocol,
// search, searchParams, username).
//
// Returns null instead of throwing when `input` cannot be parsed as a URL.
function url (input) {
  const componentNames = [
    'hash',
    'host',
    'hostname',
    'href',
    'origin',
    'password',
    'pathname',
    'port',
    'protocol',
    'search',
    'searchParams',
    'username',
  ];

  let parsed;
  try {
    parsed = new URL(input);
  } catch (e) {
    // Unparseable input is an expected case for log data; signal it with null.
    return null;
  }

  // Copy the getters off the URL instance into a plain object, in a fixed order.
  return Object.fromEntries(componentNames.map((name) => [ name, parsed[name] ]));
}
// Parsing the line containing the version.
//
@ -31,7 +42,7 @@ const parseFields = (line) => {
if (!line.startsWith('#Fields:')) {
throw new Error(`Invalid fields line '${line}'`);
} else {
return line.match(/[\w()-]+(\s|$)/g).map(field => (
return line.match(/[\w()-]+(\s|$)/g).map((field) => (
// Strip parentheses and remove unnecessary abbreviations in field names
field.replace(/\(([^)]+)\)/, '-$1').replace(/^(c-|cs-|sc-)/, '').trim().toLowerCase()
));
@ -41,7 +52,7 @@ const parseFields = (line) => {
// Unescape value twice; the logged values appear to be double-encoded (see the thread below).
// https://forums.aws.amazon.com/thread.jspa?threadID=134017
//
const decode = value => unescape(unescape(value));
const decode = (value) => unescape(unescape(value));
// Split up line and assign to corresponding field.
//
// Parses one tab-separated log line into an analytics record.
//
// line   - raw log line; must not be a '#' directive line.
// fields - field names taken from the '#Fields:' header, in column order.
//
// Returns the analytics record, or undefined when the line is not a usable
// pixel hit: an OPTIONS request, a path other than '/i', no referer, or no
// usable `start` timestamp in the query string.
// Throws an Error when handed a '#' directive line.
//
const parseLine = (line, fields) => {
  if (line.startsWith('#')) {
    throw new Error(`Invalid log line '${line}'`);
  }

  const TIMESTAMP_FORMAT = 'yyyy-MM-dd HH:mm:ss';

  // date-fns `format` throws a RangeError when given an invalid Date, so
  // validate first and report null rather than aborting the whole log file.
  const formatTimestamp = (ms) => {
    const date = new Date(ms);
    return Number.isNaN(date.getTime()) ? null : format(date, TIMESTAMP_FORMAT);
  };

  // Joins the parts that are present with a space; null when none are.
  const joinPresent = (...parts) => parts.filter(Boolean).join(' ') || null;

  const raw = {};
  line.split('\t').forEach((section, index) => {
    if (section !== '-') raw[fields[index]] = decode(section); // Skip missing fields
  });

  // filter out OPTIONS calls
  if (raw.method === 'OPTIONS') return;

  // I only care about the pixel hits, nothing else.
  if (raw['uri-stem'] !== '/i') return;

  // this isn't an analytics event
  if (!raw.referer) return;

  // Normalise field names: 'uri-query' -> 'uri_query', etc.
  const row = Object.fromEntries(Object.entries(raw).map(([ k, v ]) => [ k.replace(/-/g, '_'), v ]));

  const query = row.uri_query
    ? Object.fromEntries(new URLSearchParams(row.uri_query))
    : {};

  // we didn't get analytics data from this load, ignore it
  if (!query.start) return;

  const sessionStart = Number(query.start);
  const clientStart = formatTimestamp(sessionStart);
  // `start` was present but is not a usable timestamp; treat it like a missing one.
  if (clientStart === null) return;

  // The client sends the literal string 'null' when the session has no end yet.
  const sessionEnd = query.end === 'null' ? 0 : Number(query.end);
  const duration = sessionEnd > sessionStart ? Math.floor((sessionEnd - sessionStart) / 1000) : null;

  const { language, viewed } = query;
  const max_scroll = parseInt(query.max_scroll, 10) || 0;
  const page_height = parseInt(query.page_height, 10) || 0;
  const viewport_height = parseInt(query.viewport_height, 10) || 0;

  const useragent = parseUA(row.user_agent);
  const device = useragent.device || {};

  const { pathname } = url(row.referer) || {};
  const { hostname: referrer_host, href: referrer } = url(query.referrer) || {};

  return {
    dts: `${row.date} ${row.time}`,
    ip: row.ip,
    tid: query.tid !== 'false' ? query.tid : null,
    url: pathname,
    referrer,
    referrer_host,
    client_start: clientStart,
    // Format the END of the session here (previously this re-formatted the start).
    client_end: sessionEnd ? formatTimestamp(sessionEnd) : null,
    duration,
    language,
    viewed,
    max_scroll,
    page_height,
    viewport_height,
    browser: useragent.browser.name,
    browser_version: useragent.browser.major,
    // Avoid emitting strings like 'undefined undefined' when the parser has no data.
    os: joinPresent(useragent.os.name, useragent.os.version),
    device_type: device.type || null,
    device: device.vendor ? joinPresent(device.vendor, device.model) : null,
    useragent,
    query,
    original: row,
  };
};
@ -79,5 +155,9 @@ exports.parseLogFile = async ({ bucket, key, region }) => {
// Shift next line containing fields format and parse it for validation
const fields = parseFields(lines.shift());
return lines.map(line => parseLine(line, fields));
console.log(`Found ${lines.length} rows to parse`); // eslint-disable-line no-console
const rows = lines.map((line) => parseLine(line, fields)).filter(Boolean);
console.log(`Produced ${rows.length} results`);
console.log('Sample', rows[0]);
return rows;
};

View File

@ -15,7 +15,7 @@ const groupBy = (array, key) => (
if (result[item[key]]) {
result[item[key]].push(item);
} else if (item[key]) {
result[item[key]] = [item];
result[item[key]] = [ item ];
}
return result;
}, {})
@ -54,7 +54,7 @@ const describeLogStream = async (logStreamName) => {
// Extend the original record with some additional fields
// and encapsule records into CloudWatch Logs event.
//
const buildlogEvents = records => (
const buildlogEvents = (records) => (
records.map((record) => {
const payload = record;
payload.name = 'logs:cloudfront';

View File

@ -31,7 +31,7 @@ resource "aws_s3_bucket_notification" "ipixel_logs" {
data "archive_file" "ipixel_parser" {
type = "zip"
source_dir = "${path.module}/lambda/src"
source_dir = "${path.module}/lambda"
output_path = ".terraform/tmp/lambda/ipixel_parser.zip"
}
@ -40,8 +40,7 @@ resource "aws_lambda_function" "ipixel_parser" {
runtime = "nodejs12.x"
handler = "index.handler"
timeout = "24"
memory_size = "512"
timeout = 5
reserved_concurrent_executions = 3
environment {
@ -60,5 +59,8 @@ resource "aws_lambda_function" "ipixel_parser" {
Role = "ipixel"
}
depends_on = [aws_cloudwatch_log_group.ipixel_parser_logs]
depends_on = [
aws_cloudwatch_log_group.ipixel_parser_logs,
aws_cloudwatch_log_group.ipixel_results,
]
}