diff --git a/.eslintrc b/.eslintrc index 0eea9ea..b031bb5 100644 --- a/.eslintrc +++ b/.eslintrc @@ -2,11 +2,11 @@ "extends": "twipped/node", "rules": { "node/no-unpublished-require": 0, - 'indent': [ 2, 2, { - 'MemberExpression': 1, + "indent": [ 2, 2, { + "MemberExpression": 1 } ], - 'node/no-unsupported-features/es-syntax': [ 'error' ], - 'node/no-unsupported-features/es-builtins': [ 'error' ], - 'node/no-unsupported-features/node-builtins': [ 'error' ], + "node/no-unsupported-features/es-syntax": [ "error" ], + "node/no-unsupported-features/es-builtins": [ "error" ], + "node/no-unsupported-features/node-builtins": [ "error" ] } } diff --git a/analytics/index.js b/analytics/index.js index 52873a2..ef41ff0 100644 --- a/analytics/index.js +++ b/analytics/index.js @@ -129,15 +129,14 @@ async function* loadFiles () { readableObjectMode: true, writableObjectMode: true, transform (row, encoding, done) { - console.log(row); // filter out OPTIONS calls - if (row['cs-method'] === 'OPTIONS') return null; + if (row['cs-method'] === 'OPTIONS') return done(); // I only care about the pixel hits, nothing else. 
- if (row['cs-uri-stem'] !== '/i') return null; + if (row['cs-uri-stem'] !== '/i') return done(); // this isn't an analytics event - if (row['cs-referer'] === '-') return null; + if (row['cs-referer'] === '-') return done(); row = Object.fromEntries(Object.entries(row).map(([ k, v ]) => [ k.replace(/-/g, '_'), v ])); @@ -147,7 +146,7 @@ async function* loadFiles () { ; // we didn't get analytics data from this load, ignore it - if (!query.start) return null; + if (!query.start) return done(); const useragent = parseUA(row.cs_user_agent); diff --git a/terraform/cloudwatch-logs.tf b/terraform/cloudwatch-logs.tf index 5812c1a..8288a58 100644 --- a/terraform/cloudwatch-logs.tf +++ b/terraform/cloudwatch-logs.tf @@ -5,7 +5,7 @@ data "aws_region" "current" {} resource "aws_cloudwatch_log_group" "ipixel_results" { name = "/aws/ipixel/${var.site}" - retention_in_days = 30 + retention_in_days = 90 tags = { Site = var.site, diff --git a/terraform/lambda/.eslintrc.json b/terraform/lambda/.eslintrc.json index 7cde01d..031627b 100644 --- a/terraform/lambda/.eslintrc.json +++ b/terraform/lambda/.eslintrc.json @@ -1,3 +1,12 @@ { - "extends": "airbnb-base" + "extends": "twipped/node", + "rules": { + "node/no-unpublished-require": 0, + "indent": [ 2, 2, { + "MemberExpression": 1 + } ], + "node/no-unsupported-features/es-syntax": [ "error" ], + "node/no-unsupported-features/es-builtins": [ "error" ], + "node/no-unsupported-features/node-builtins": [ "error" ] + } } diff --git a/terraform/lambda/index.js b/terraform/lambda/index.js new file mode 100644 index 0000000..def9931 --- /dev/null +++ b/terraform/lambda/index.js @@ -0,0 +1,2 @@ + +module.exports = exports = require('./src/index'); diff --git a/terraform/lambda/package-lock.json b/terraform/lambda/package-lock.json index ff63714..18cd73e 100644 --- a/terraform/lambda/package-lock.json +++ b/terraform/lambda/package-lock.json @@ -305,6 +305,11 @@ "which": "^2.0.1" } }, + "date-fns": { + "version": "2.18.0", + "resolved": 
"https://registry.npmjs.org/date-fns/-/date-fns-2.18.0.tgz", + "integrity": "sha512-NYyAg4wRmGVU4miKq5ivRACOODdZRY3q5WLmOJSq8djyzftYphU7dTHLcEtLqEvfqMKQ0jVv91P4BAwIjsXIcw==" + }, "debug": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", @@ -1547,6 +1552,11 @@ "integrity": "sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==", "dev": true }, + "ua-parser-js": { + "version": "0.7.24", + "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.24.tgz", + "integrity": "sha512-yo+miGzQx5gakzVK3QFfN0/L9uVhosXBBO7qmnk7c2iw1IhL212wfA3zbnI54B0obGwC/5NWub/iT9sReMx+Fw==" + }, "uri-js": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", diff --git a/terraform/lambda/package.json b/terraform/lambda/package.json index 6de2e7a..8370b9a 100644 --- a/terraform/lambda/package.json +++ b/terraform/lambda/package.json @@ -2,7 +2,9 @@ "name": "cloudfront-logs", "version": "0.0.1", "dependencies": { - "aws-sdk": "*" + "aws-sdk": "*", + "date-fns": "~2.18.0", + "ua-parser-js": "~0.7.24" }, "devDependencies": { "eslint": "*", @@ -11,5 +13,8 @@ }, "scripts": { "lint": "eslint ." 
+ }, + "engines": { + "node": ">=12.14.0" } } diff --git a/terraform/lambda/src/cloudfront.js b/terraform/lambda/src/cloudfront.js index 816223b..de4cf6b 100644 --- a/terraform/lambda/src/cloudfront.js +++ b/terraform/lambda/src/cloudfront.js @@ -2,9 +2,20 @@ const { gunzip } = require('zlib'); const { promisify } = require('util'); const { S3 } = require('aws-sdk'); const { unescape } = require('querystring'); +const parseUA = require('ua-parser-js'); +const format = require('date-fns/format'); +const { URL } = require('url'); const gunzipAsync = promisify(gunzip); +function url (input) { + try { + const { hash, host, hostname, href, origin, password, pathname, port, protocol, search, searchParams, username } = new URL(input); // eslint-disable-line max-len + return { hash, host, hostname, href, origin, password, pathname, port, protocol, search, searchParams, username }; + } catch (e) { + return null; + } +} // Parsing the line containing the version. // @@ -31,7 +42,7 @@ const parseFields = (line) => { if (!line.startsWith('#Fields:')) { throw new Error(`Invalid fields line '${line}'`); } else { - return line.match(/[\w()-]+(\s|$)/g).map(field => ( + return line.match(/[\w()-]+(\s|$)/g).map((field) => ( // Strip parentheses and remove unecessary abbreviations in field names field.replace(/\(([^)]+)\)/, '-$1').replace(/^(c-|cs-|sc-)/, '').trim().toLowerCase() )); @@ -41,7 +52,7 @@ const parseFields = (line) => { // Unescape value twice (because fuck you that's why). // https://forums.aws.amazon.com/thread.jspa?threadID=134017 // -const decode = value => unescape(unescape(value)); +const decode = (value) => unescape(unescape(value)); // Split up line and assign to corresponding field. 
//
@@ -49,11 +60,83 @@ const parseLine = (line, fields) => {
   if (line.startsWith('#')) {
     throw new Error(`Invalid log line '${line}'`);
   } else {
-    return line.split('\t').reduce((object, section, index) => {
+    let row = line.split('\t').reduce((object, section, index) => {
       const result = object;
       if (section !== '-') result[fields[index]] = decode(section); // Skip missing fields
       return result;
     }, {});
+
+    // Only pixel hits that carry analytics data become records. Every other
+    // line returns undefined and is dropped by filter(Boolean) in parseLogFile.
+
+    // filter out OPTIONS calls
+    if (row.method === 'OPTIONS') return;
+
+    // I only care about the pixel hits, nothing else.
+    if (row['uri-stem'] !== '/i') return;
+
+    // this isn't an analytics event
+    if (!row.referer) return;
+
+    row = Object.fromEntries(Object.entries(row).map(([ k, v ]) => [ k.replace(/-/g, '_'), v ]));
+
+    const query = (row.uri_query)
+      ? Object.fromEntries(new URLSearchParams(row.uri_query))
+      : {}
+    ;
+
+    const sessionStart = Number(query.start);
+    const sessionEnd = query.end === 'null' ? 0 : Number(query.end);
+    const duration = sessionEnd > sessionStart ? Math.floor((sessionEnd - sessionStart) / 1000) : null;
+    const startDate = new Date(sessionStart);
+    const endDate = new Date(sessionEnd);
+
+    // we didn't get usable analytics data from this load, ignore it.
+    // date-fns format() throws a RangeError on an Invalid Date, and a throw
+    // here escapes the lines.map() in parseLogFile, so vet the start first.
+    if (!query.start || Number.isNaN(startDate.getTime())) return;
+
+    // a missing, zero or unparseable end means there is no end to report
+    const hasEnd = Boolean(sessionEnd) && !Number.isNaN(endDate.getTime());
+
+    const useragent = parseUA(row.user_agent);
+
+    const { language, viewed } = query;
+    const max_scroll = parseInt(query.max_scroll, 10) || 0;
+    const page_height = parseInt(query.page_height, 10) || 0;
+    const viewport_height = parseInt(query.viewport_height, 10) || 0;
+
+    const { pathname } = url(row.referer) || {};
+    const { hostname: referrer_host, href: referrer } = url(query.referrer) || {};
+
+    const result = {
+      dts: `${row.date} ${row.time}`,
+      ip: row.ip,
+      tid: query.tid !== 'false' ? query.tid : null,
+      url: pathname,
+      referrer,
+      referrer_host,
+      client_start: format(startDate, 'yyyy-MM-dd HH:mm:ss'),
+      client_end: hasEnd ? format(endDate, 'yyyy-MM-dd HH:mm:ss') : null,
+      duration,
+      language,
+      viewed,
+      max_scroll,
+      page_height,
+      viewport_height,
+      browser: useragent.browser.name,
+      browser_version: useragent.browser.major,
+      // ua-parser-js leaves undetected fields undefined, so join only the
+      // parts that are known instead of storing "undefined undefined"
+      os: [ useragent.os.name, useragent.os.version ].filter(Boolean).join(' ') || null,
+      device_type: useragent.device && useragent.device.type || null,
+      device: useragent.device && [ useragent.device.vendor, useragent.device.model ].filter(Boolean).join(' ') || null,
+      useragent,
+      query,
+      original: row,
+    };
+
+    return result;
   }
 };
 
@@ -79,5 +162,9 @@ exports.parseLogFile = async ({ bucket, key, region }) => {
   // Shift next line containing fields format and parse it for validation
   const fields = parseFields(lines.shift());
 
-  return lines.map(line => parseLine(line, fields));
+  console.log(`Found ${lines.length} rows to parse`); // eslint-disable-line no-console
+  const rows = lines.map((line) => parseLine(line, fields)).filter(Boolean);
+  console.log(`Produced ${rows.length} results`); // eslint-disable-line no-console
+  console.log('Sample', rows[0]); // eslint-disable-line no-console
+  return rows;
 };
diff --git a/terraform/lambda/src/cloudwatch-logs.js b/terraform/lambda/src/cloudwatch-logs.js
index c8a8945..dca5e73 100644
--- a/terraform/lambda/src/cloudwatch-logs.js
+++ b/terraform/lambda/src/cloudwatch-logs.js
@@ -15,7 +15,7 @@ const groupBy = (array, key) => (
     if (result[item[key]]) {
       result[item[key]].push(item);
     } else if (item[key]) {
-      result[item[key]] = [item];
+      result[item[key]] = [ item ];
     }
     return result;
   }, {})
@@ -54,7 +54,7 @@ const describeLogStream = async (logStreamName) => {
 // Extend the original record with some additional fields
 // and encapsule records into CloudWatch Logs event.
// -const buildlogEvents = records => ( +const buildlogEvents = (records) => ( records.map((record) => { const payload = record; payload.name = 'logs:cloudfront'; diff --git a/terraform/logging.tf b/terraform/logging.tf index b3dc209..aaab291 100644 --- a/terraform/logging.tf +++ b/terraform/logging.tf @@ -31,7 +31,7 @@ resource "aws_s3_bucket_notification" "ipixel_logs" { data "archive_file" "ipixel_parser" { type = "zip" - source_dir = "${path.module}/lambda/src" + source_dir = "${path.module}/lambda" output_path = ".terraform/tmp/lambda/ipixel_parser.zip" } @@ -40,8 +40,7 @@ resource "aws_lambda_function" "ipixel_parser" { runtime = "nodejs12.x" handler = "index.handler" - timeout = "24" - memory_size = "512" + timeout = 5 reserved_concurrent_executions = 3 environment { @@ -60,5 +59,8 @@ resource "aws_lambda_function" "ipixel_parser" { Role = "ipixel" } - depends_on = [aws_cloudwatch_log_group.ipixel_parser_logs] + depends_on = [ + aws_cloudwatch_log_group.ipixel_parser_logs, + aws_cloudwatch_log_group.ipixel_results, + ] }