moved analytics tooling out of this repo

This commit is contained in:
Jocelyn Badgley 2024-10-06 10:42:50 -07:00
parent 16e00a195b
commit d818495fdc
No known key found for this signature in database
GPG Key ID: 909059F0FFF842D8
10 changed files with 0 additions and 2143 deletions

View File

@ -1,9 +0,0 @@
{
"extends": "@twipped/eslint-config/node-cjs",
"rules": {
"node/no-unpublished-require": 0,
"indent": [ 2, 2, {
"MemberExpression": 1
} ]
}
}

View File

@ -1,132 +0,0 @@
/**
*
* @twipped/utils
*
* Copyright (c) 2020, Jocelyn Badgley
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
'use strict';
Object.defineProperty(exports, '__esModule', { value: true });
var path = require('path');
var fs = require('fs');
var stream = require('stream');
var util = require('util');
// Normalize a CommonJS/ESM export so it can always be read via `.default`.
function _interopDefaultLegacy (e) {
  if (e && typeof e === 'object' && 'default' in e) return e;
  return { 'default': e };
}
var stream__default = /* #__PURE__*/_interopDefaultLegacy(stream);
// Promise-returning wrapper around stream.pipeline.
const pipeline = util.promisify(stream__default.default.pipeline);
// Create a directory (recursively by default); resolves when it exists.
const mkdir = (f, recursive = true) => fs.promises.mkdir(f, { recursive });

// True when the path is accessible, false otherwise (never rejects).
const exists = (f) => fs.promises.access(f).then(
  () => true,
  () => false
);

// Stat helpers that resolve to null instead of rejecting on error.
const stat = (f) => fs.promises.stat(f).catch(() => null);
const linkStat = (f) => fs.promises.lstat(f).catch(() => null);

/**
 * Check whether `file` can be written: either the file itself is writable,
 * or it does not exist yet but its parent directory is writable.
 */
async function isWritable (file) {
  const mode = fs.constants.F_OK | fs.constants.W_OK;
  try {
    await fs.promises.access(file, mode);
    return true;
  } catch (err) {
    // Missing file: writable if we could create it in the parent directory.
    if (err.code !== 'ENOENT') return false;
    try {
      await fs.promises.access(path.dirname(file), mode);
      return true;
    } catch (e) {
      return false;
    }
  }
}

/**
 * Unix-style touch: bump mtime/atime on an existing file, create an empty
 * file (and any missing parent directories) otherwise. Directories are
 * left alone.
 */
async function touch (file) {
  const stats = await linkStat(file);
  if (stats) {
    if (stats.isDirectory()) return; // nothing to do
    return await fs.promises.utimes(file, new Date(), new Date());
  }
  const parent = path.dirname(file);
  if (!(await exists(parent))) await mkdir(parent);
  await fs.promises.writeFile(file, '');
}

/**
 * Delete a file or directory tree; silently does nothing when the path is
 * already absent.
 * NOTE(review): fs.rmdir's `recursive` option is deprecated in newer Node;
 * fs.rm is the modern replacement — confirm the supported Node range before
 * switching.
 */
async function remove (file) {
  const stats = await linkStat(file);
  if (!stats) return;
  return stats.isDirectory()
    ? fs.promises.rmdir(file, { recursive: true })
    : fs.promises.unlink(file);
}
/**
 * Serialize `object` as JSON and write it to `file` with a trailing newline.
 * `replacer` and `spaces` are forwarded to JSON.stringify; all remaining
 * options (defaulting to utf8 encoding) go to fs.promises.writeFile.
 */
async function writeJson (file, object, options) {
  const { replacer, spaces, ...fileOptions } = { encoding: 'utf8', ...options };
  const json = JSON.stringify(object, replacer, spaces);
  await fs.promises.writeFile(file, json + '\n', fileOptions);
}
const writeJSON = writeJson;
// Strip a leading UTF-8 byte-order mark; Buffers are decoded to utf8 first.
function stripBom (content) {
  const text = Buffer.isBuffer(content) ? content.toString('utf8') : content;
  return text.replace(/^\uFEFF/, '');
}

/**
 * Read and parse a JSON file. `reviver` is forwarded to JSON.parse; with
 * `quiet`, parse failures resolve to undefined instead of rejecting. All
 * remaining options (defaulting to utf8 encoding) go to fs.promises.readFile.
 */
async function readJson (file, options) {
  const { reviver, quiet, ...fileOptions } = { encoding: 'utf8', ...options };
  const content = await fs.promises.readFile(file, fileOptions);
  try {
    return JSON.parse(stripBom(content), reviver);
  } catch (err) {
    if (quiet) return undefined;
    throw err;
  }
}
const readJSON = readJson;
// Public API of this compiled CommonJS bundle. Both casings of the JSON
// helpers are exported as aliases of the same function.
exports.exists = exists;
exports.isWritable = isWritable;
exports.linkStat = linkStat;
exports.mkdir = mkdir;
exports.pipeline = pipeline;
exports.readJSON = readJSON;
exports.readJson = readJson;
exports.remove = remove;
exports.stat = stat;
exports.touch = touch;
exports.writeJSON = writeJSON;
exports.writeJson = writeJson;

View File

@ -1,235 +0,0 @@
/* eslint no-console:0 */
const path = require('path');
const { URL } = require('url');
const CloudFrontParser = require('cloudfront-log-parser');
const parseUA = require('ua-parser-js');
const format = require('date-fns/format');
const zlib = require('zlib');
const { pipeline } = require('./fs');
const { Readable, Transform, Writable } = require('stream');
const { open: opensql } = require('sqlite');
const sqlite3 = require('sqlite3');
const sql = require('./sql-tag');
// Merge the promise API over the callback namespace: promise versions shadow
// their callback counterparts (e.g. fs.opendir), while callback-only helpers
// like fs.createReadStream remain available on the same object.
let fs = require('fs');
fs = { ...fs, ...fs.promises };
/**
 * Parse `input` into a plain object of URL components, or null when the
 * string is not a valid absolute URL.
 */
function url (input) {
  let parsed;
  try {
    parsed = new URL(input);
  } catch (e) {
    return null;
  }
  const { hash, host, hostname, href, origin, password, pathname, port, protocol, search, searchParams, username } = parsed; // eslint-disable-line max-len
  return { hash, host, hostname, href, origin, password, pathname, port, protocol, search, searchParams, username };
}
// function asyncthrough (...args) {
// const [ fn, donefn ] = args;
// args[0] = function (file, enc, next) {
// fn(this, file, enc).then(() => next(), (err) => { console.error(err, 'Error thrown'); next(err); });
// };
// if (donefn) {
// args[1] = function (next) {
// donefn(this).then(() => next(), (err) => { console.error(err, 'Error thrown'); next(err); });
// };
// }
// return through.obj(...args);
// }
const parser = new CloudFrontParser({ format: 'web' });
/**
 * Async generator yielding decompressed chunks from every `.gz` log file in
 * the local RAW directory, one file after another.
 */
async function* loadFiles () {
  const dir = path.resolve(__dirname, 'RAW');
  for await (const entry of await fs.opendir(dir)) {
    if (!entry.isFile()) continue;
    const fullPath = path.resolve(dir, entry.name);
    if (path.parse(fullPath).ext !== '.gz') continue;
    // Gunzip on the fly and delegate the stream's chunks downstream.
    const unzipped = fs.createReadStream(fullPath).pipe(zlib.createGunzip());
    yield* unzipped;
  }
}
(async () => {
  // Open (or create) the local SQLite database that aggregates the logs.
  const db = await opensql({
    filename: path.resolve(__dirname, 'database.sqlite'),
    driver: sqlite3.Database,
  });
  // One row per analytics pixel hit.
  await db.run(sql`
    CREATE TABLE IF NOT EXISTS records (
      dts INTEGER,
      ip TEXT,
      tid INTEGER,
      url TEXT,
      referrer TEXT,
      referrer_host TEXT,
      client_start INTEGER,
      client_end INTEGER,
      duration INTEGER,
      language TEXT,
      scrolled INTEGER,
      max_scroll INTEGER,
      page_height INTEGER,
      viewport_height INTEGER,
      browser TEXT,
      browser_version INTEGER,
      os TEXT,
      device_type TEXT,
      device TEXT
    )
  `);
  // Dedupe key: re-ingesting the same log line REPLACEs instead of duplicating.
  await db.exec(sql`
    CREATE UNIQUE INDEX IF NOT EXISTS entries ON records (
      dts,
      ip,
      tid
    );
  `);
  await db.run('PRAGMA busy_timeout = 6000');
  // Upsert statement keyed by the unique (dts, ip, tid) index above.
  const stmt = await db.prepare(sql`
    REPLACE INTO records VALUES (
      :dts,
      :ip,
      :tid,
      :url,
      :referrer,
      :referrer_host,
      :client_start,
      :client_end,
      :duration,
      :language,
      :scrolled,
      :max_scroll,
      :page_height,
      :viewport_height,
      :browser,
      :browser_version,
      :os,
      :device_type,
      :device
    );
  `);
  let counter = 0;
  await pipeline(
    Readable.from(loadFiles()),
    parser,
    new Transform({
      readableObjectMode: true,
      writableObjectMode: true,
      // Convert one parsed CloudFront log row into a flat record object.
      transform (row, encoding, done) {
        // filter out OPTIONS calls
        if (row['cs-method'] === 'OPTIONS') return done();
        // I only care about the pixel hits, nothing else.
        if (row['cs-uri-stem'] !== '/i') return done();
        // this isn't an analytics event
        if (row['cs-referer'] === '-') return done();
        // Rename keys from dash-case to snake_case for property access.
        row = Object.fromEntries(Object.entries(row).map(([ k, v ]) => [ k.replace(/-/g, '_'), v ]));
        const query = (row.cs_uri_query === '-')
          ? {}
          : Object.fromEntries(new URLSearchParams(row.cs_uri_query))
        ;
        // we didn't get analytics data from this load, ignore it
        if (!query.start) return done();
        const useragent = parseUA(row.cs_user_agent);
        const sessionStart = Number(query.start);
        const sessionEnd = query.end === 'null' ? 0 : Number(query.end);
        // Session length in whole seconds; null when the end is missing/bogus.
        const duration = sessionEnd > sessionStart ? Math.floor((sessionEnd - sessionStart) / 1000) : null;
        let {
          language,
          viewed,
          max_scroll,
          page_height,
          viewport_height,
        } = query;
        max_scroll = parseInt(max_scroll, 10) || 0;
        page_height = parseInt(page_height, 10) || 0;
        viewport_height = parseInt(viewport_height, 10) || 0;
        const { pathname } = url(row.cs_referer) || {};
        const { hostname: referrer_host, href: referrer } = url(query.referrer) || {};
        const result = {
          dts: `${row.date} ${row.time}`,
          ip: row.c_ip,
          tid: query.tid !== 'false' ? query.tid : null,
          url: pathname,
          referrer,
          referrer_host,
          client_start: format(new Date(sessionStart), 'yyyy-MM-dd HH:mm:ss'),
          // FIX: this previously formatted sessionStart, so client_end always
          // mirrored client_start; format the actual session end instead.
          client_end: sessionEnd ? format(new Date(sessionEnd), 'yyyy-MM-dd HH:mm:ss') : null,
          duration,
          language,
          scrolled: viewed,
          max_scroll,
          page_height,
          viewport_height,
          browser: useragent.browser.name,
          browser_version: useragent.browser.major,
          os: useragent.os.name + ' ' + useragent.os.version,
          device_type: useragent.device && useragent.device.type || null,
          device: useragent.device && useragent.device.vendor && useragent.device.vendor + ' ' + useragent.device.model || null,
        };
        this.push(result);
        done();
      },
    }),
    new Writable({
      objectMode: true,
      // highWaterMark: 2,
      // Insert each record, spinning on SQLITE_BUSY until the write lands.
      write (record, encoding, done) {
        (async () => {
          // Prefix keys with ':' to match the named placeholders; empty
          // values are normalized to NULL.
          const params = Object.fromEntries(
            Object.entries(record).map(([ k, v ]) => [ ':' + k, v || null ]),
          );
          while (true) {
            try {
              await stmt.run(params);
              break;
            } catch (err) {
              if (err.code !== 'SQLITE_BUSY') throw err;
            }
          }
          counter++;
          // Progress dot every 10 rows.
          if (!(counter % 10)) process.stdout.write('.');
        })().then(() => done(), done);
      },
    }),
  );
  await stmt.finalize();
  await db.close();
})().then(
  () => process.exit(),
  (err) => {
    console.error(err.stack);
    process.exit(1);
  },
);

File diff suppressed because it is too large Load Diff

View File

@ -1,31 +0,0 @@
{
"name": "decorate",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"start": "node index.js",
"sync": "aws s3 sync s3://gdbible-analytics/RAW ./RAW",
"empty": "aws s3 rm s3://gdbible-analytics/RAW --recursive",
"combine": "find RAW -name '*.gz' -exec cat '{}' ';' > combined.log"
},
"author": "Jocelyn Badgley <joc@twipped.com> (http://twipped.com/)",
"license": "MIT",
"dependencies": {
"cloudfront-log-parser": "~1.2.0",
"date-fns": "~2.30.0",
"glob-stream": "~8.0.0",
"named-placeholders": "~1.1.3",
"readable-stream": "~4.4.0",
"split2": "~4.2.0",
"sqlite": "~4.2.0",
"sqlite3": "~5.1.6",
"stream-chain": "~2.2.5",
"through2": "~4.0.2",
"ua-parser-js": "~1.0.35"
},
"engines": {
"node": ">=12.14.0"
},
"devDependencies": {}
}

View File

@ -1,32 +0,0 @@
-- Distinct visitors per calendar day (tid, falling back to IP when the
-- tracking id is null).
SELECT
date(dts) as day,
count(DISTINCT IFNULL(tid, ip)) as tids
FROM records
GROUP BY date(dts);
-- Histogram of session lengths bucketed by minute; duration is stored in
-- seconds, sessions over 30 minutes are excluded, and buckets with 5 or
-- fewer visits are dropped.
SELECT
(duration / 60) as minutes,
COUNT(IFNULL(tid,ip)) as total
FROM records
WHERE duration > 1 AND duration < (60 * 30)
GROUP BY duration / 60
HAVING total > 5;
-- Top external referrer hosts (by distinct visitors) over the last month.
SELECT referrer_host, count(DISTINCT IFNULL(tid, ip)) as tids, referrer
FROM records
WHERE date(dts) > date('now', '-1 month')
AND referrer_host != 'genderdysphoria.fyi'
GROUP BY referrer_host
ORDER BY tids DESC;
-- Referrer hosts containing "tiktok" over the last day.
SELECT referrer_host, count(DISTINCT IFNULL(tid, ip)) as tids, referrer
FROM records
WHERE date(dts) > date('now', '-1 day')
AND INSTR(referrer_host, 'tiktok')
GROUP BY referrer_host
ORDER BY tids DESC;
-- Total hits per individual reddit referrer URL.
SELECT COUNT(IFNULL(tid,ip)) as total, referrer
FROM records
WHERE referrer_host LIKE '%reddit.com'
GROUP BY referrer

View File

@ -1,59 +0,0 @@
const path = require('path');
const { open: opensql } = require('sqlite');
const sqlite3 = require('sqlite3');
const sql = require('../sql-tag');
(async () => {
  // Open the analytics database produced by the ingest script.
  const db = await opensql({
    filename: path.resolve(__dirname, '..', 'database.sqlite'),
    driver: sqlite3.Database,
  });
  // FIX: this index was previously named "entries", which collides with the
  // unique (dts, ip, tid) index of the same name created at ingest time;
  // because of IF NOT EXISTS the referrer_host index was silently never
  // created. A distinct name lets it actually be built.
  await db.exec(sql`
    CREATE INDEX IF NOT EXISTS idx_records_referrer_host ON records (
      referrer_host
    );
  `);
  // Distinct visitors per referrer host over the last year, excluding
  // self-referrals from the site itself.
  const rows = await db.all(sql`
    SELECT referrer_host, count(DISTINCT IFNULL(tid, ip)) as tids
    FROM records
    WHERE date(dts) > date('now', '-12 month')
    AND referrer_host != 'genderdysphoria.fyi'
    AND referrer_host != 'www.genderdysphoria.fyi'
    GROUP BY referrer_host
    ORDER BY tids DESC;
  `);
  // Collapse hosts that normalize to the same canonical site, summing their
  // visitor counts; keep the first raw hostname seen as the display label.
  const hosts = new Map();
  for (const { referrer_host, tids } of rows) {
    const host = matchHost(referrer_host);
    const row = hosts.get(host) || { referrer_host, count: 0 };
    hosts.set(host, { host, referrer_host: row.referrer_host, count: row.count + tids });
  }
  // Print the top 100 sites by distinct visitors.
  let results = Array.from(hosts.values(), ({ referrer_host, count }) => [ referrer_host, count ]);
  results.sort((a, b) => b[1] - a[1]);
  results = results.slice(0, 100);
  for (const [ host, count ] of results) {
    process.stdout.write(host.padEnd(38, ' ') + count + '\n');
  }
})().catch(console.error);
// Matches the literal host "t.co", or captures the registrable label of a
// hostname: the segment just before the TLD, skipping a ".com"/".co"
// second-level domain (e.g. "www.reddit.com" -> "reddit", "bbc.co.uk" -> "bbc").
// FIX: the TLD character class was written [A-za-z]; "A-z" is a range that
// accidentally includes "[", "\", "]", "^", "_" and backtick. Corrected to
// [A-Za-z]; valid hostname TLDs are unaffected.
const MATCH = /^t\.co$|(?:[^.]+)(?=(?:\.com?)?\.[A-Za-z]{2,}$)/;

/**
 * Normalize a raw referrer hostname to a short canonical site label.
 * Handles a few hard-coded special cases (android reddit clients, google
 * translate, messenger), reverses android-style reverse-DNS package names
 * ("com.twitter.android" -> "android.twitter.com"), then applies MATCH.
 * Falls back to the input unchanged when nothing matches.
 *
 * @param {string} input - referrer hostname from the analytics records
 * @returns {string} canonical host label
 */
function matchHost (input) {
  if (input === 'com.andrewshu.android.reddit') return 'reddit';
  if (input.startsWith('com.laurencedawson.reddit_sync')) return 'reddit';
  if (input === 'genderdysphoria-fyi.translate.goog') return 'google';
  if (input.endsWith('.messenger.com')) return 'facebook';
  if (input.startsWith('com.')) {
    input = input.split('.').reverse().join('.');
  }
  const [ host ] = input.match(MATCH) || [ input ];
  return host;
}

View File

@ -1,27 +0,0 @@
const path = require('path');
const { open: opensql } = require('sqlite');
const sqlite3 = require('sqlite3');
const sql = require('../sql-tag');
(async () => {
  // Open the shared analytics database.
  const db = await opensql({
    filename: path.resolve(__dirname, '..', 'database.sqlite'),
    driver: sqlite3.Database,
  });
  // Distinct visitors (tid, falling back to IP) per day over the last year.
  const rows = await db.all(sql`
    SELECT
      date(dts) as day,
      count(DISTINCT IFNULL(tid, ip)) as tids
    FROM records
    WHERE date(dts) > date('now', '-12 month')
    GROUP BY date(dts);
  `);
  // Emit one tab-separated "day<TAB>count" line per row.
  for (const { day, tids } of rows) {
    process.stdout.write(day + '\t' + tids + '\n');
  }
})().catch(console.error);

View File

@ -1,48 +0,0 @@
const namedParams = require('named-placeholders')();
/**
 * Remove the common leading indentation from every line of `input`.
 * Arrays are de-indented element-wise and concatenated.
 */
function stripIndent (input) {
  if (Array.isArray(input)) {
    return input.map(stripIndent).join('');
  }
  // Leading whitespace (excluding newlines) of every non-blank line.
  const leads = input.match(/^[^\S\n]*(?=\S)/gm);
  const depth = leads && Math.min(...leads.map((lead) => lead.length));
  if (!depth) return input;
  return input.replace(new RegExp(`^.{${depth}}`, 'gm'), '');
}
/**
 * True when `input` is a non-array object. With `strict`, additionally
 * require a plain object (constructed by Object, not a class instance or a
 * null-prototype dictionary).
 */
function isObject (input, strict = false) {
  if (!input || typeof input !== 'object' || Array.isArray(input)) return false;
  if (!strict) return true;
  return (input instanceof Object)
    && input.constructor === Object.prototype.constructor;
}
// True for any value other than null or undefined (0/''/false still count).
const isNotUndefinedOrNull = (input) => !(input === null || input === undefined);
// Pass defined values through unchanged; null/undefined become ''.
const valueOrEmpty = (input) => (isNotUndefinedOrNull(input) ? input : '');
/**
 * Build a template tag bound to `data`: the composed SQL string is run
 * through named-placeholders with that data to resolve :named parameters.
 */
function withData (data) {
  return (...args) => namedParams(sql(...args), data);
}
/**
 * Template tag that interpolates values (null/undefined render as '') and
 * strips the common leading indentation from the assembled string.
 */
function sql (strings, ...values) {
  let out = '';
  strings.forEach((chunk, i) => {
    out += chunk + valueOrEmpty(values[i]);
  });
  return stripIndent(out);
}
// Public entry point; supports three invocation styles:
//   sqlTag`SELECT ...`        -> direct template-tag usage
//   sqlTag({ id: 1 })`...`    -> tag whose :named placeholders bind to data
//   sqlTag()`...`             -> tag bound to an empty data object
module.exports = exports = (...args) => {
  // No args, or a single plain object: return a data-bound template tag.
  if (args.length === 0 || (args.length === 1 && isObject(args[0]))) {
    return withData(args[0] || {});
  }
  // Called as a template tag: the first argument is the strings array.
  if (Array.isArray(args[0])) return sql(...args);
  throw new TypeError('Unknown invocation of sql-tag');
};

View File

@ -1,43 +0,0 @@
var Readable = require('readable-stream').Readable;
// Default mapper: returns its argument unchanged.
function identity (value) {
  return value;
}
/**
 * Wrap a paged AWS request in an object-mode Readable stream: each listed
 * object is pushed downstream (optionally mapped by `fn`), and pagination is
 * followed until exhausted, at which point the stream ends.
 *
 * NOTE(review): `req` appears to be an unsent AWS.Request (e.g. the return
 * of `s3.listObjectsV2(params)` called without a callback) — confirm with
 * callers. The handler reads `data.Contents`, so responses are assumed to be
 * S3-list shaped.
 *
 * @param {object} req - a not-yet-executed AWS request
 * @param {function} [fn] - optional mapper applied to each object before it
 *   is pushed; defaults to the identity function
 * @param {object} [opts] - extra Readable options (`read` and `objectMode`
 *   are always overridden here)
 */
function s3PageStream (req, fn, opts) {
  opts = Object.assign({}, opts, { read, objectMode: true });
  if (!fn) fn = identity;
  var stream = new Readable(opts);
  return stream;
  // _read implementation: send the request exactly once, on first demand.
  function read () {
    if (!req) return;
    var _req = req;
    req = null; // poor man's once!
    _req.send(page_handler);
  }
  // Response callback; `this` is presumably the AWS response object, since
  // it exposes hasNextPage()/nextPage() — confirm against the AWS SDK docs.
  function page_handler (e, data) {
    if (e) return stream.destroy(e);
    data.Contents.forEach((obj) => {
      stream.push(fn(obj));
    });
    // Chain into the next page if there is one; otherwise end the stream.
    var nextPage = this.hasNextPage() ? this.nextPage() : null;
    if (nextPage) nextPage.send(page_handler);
    else stream.push(null);
  }
}