commit d3905a0f62120317b3f11784744cda8797899583 Author: cn Date: Wed Nov 23 19:40:22 2016 +0100 Fiat lux diff --git a/README.md b/README.md new file mode 100644 index 0000000..77fb192 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# nginx-metrics-graphite + +This is a Lua plugin for the Nginx web server that automatically collects and submits several important Nginx metrics to [Graphite](https://graphiteapp.org/) suitable for visualisation with e.g. [Grafana](http://grafana.org/). + +In contrast to commercial and proprietary solutions such as [Luameter](https://luameter.com/) or [NGINX Plus](https://www.nginx.com/products/) with its [ngx_http_status_module](http://nginx.org/en/docs/http/ngx_http_status_module.html), this plugin is open source software while featuring additional metrics compared to those available via the open source [ngx_http_stub_status_module](http://nginx.org/en/docs/http/ngx_http_stub_status_module.html). + +This plugin takes inspiration from other Nginx metric libraries like [nginx-lua-prometheus](https://github.com/knyar/nginx-lua-prometheus) but differs fundamentally in how metrics are submitted. At regular intervals it automatically pushes metrics to the configured Graphite (i.e. Carbon) host using pure Lua code instead of exposing them via a separate web page for polling. + +The metrics collection happens on every request for which the user configures a suitable `log_by_lua` directive and feeds server-wide global counters (finer granularity might be added later). The counters are realized using a single shared dictionary across all Nginx worker processes which has constant memory usage (128 KiB currently, may be reduced further). 
--[==[
nginx-metrics-graphite -- metrics_graphite.lua

Collected metrics in this prototype implementation:

* number of requests
* average request duration
* accumulated request sizes over all requests
* accumulated response sizes over all requests
* HTTP status code classes (1xx, 2xx, 3xx, 4xx, 5xx)
* HTTP methods (GET, HEAD, PUT, POST, DELETE, OPTIONS, others)

## Caveats

A short metric submission interval might cause blocking on the Nginx worker
processes since the shared dictionary storing all counters has to be locked.

Intermittent network errors while communicating with Graphite might lead to
permanent loss of metric information.

## Install

* Install `nginx-extra` (includes Lua support) on Debian Jessie
* Clone the nginx-metrics-graphite repository to */opt/nginx-metrics-graphite*
* Add the following config to top-level `http` block:

    ```nginx
    resolver x.y.z.w; # DNS resolver IP address needed

    lua_shared_dict metrics_graphite 128k;
    lua_package_path ";;/opt/nginx-metrics-graphite/?.lua";
    init_by_lua 'metrics_graphite = require("metrics_graphite").init("graphite.example.net", 300, "my.node.nginx_metrics.prefix")';
    init_worker_by_lua 'metrics_graphite:worker()';
    ```

* Instrument the `http` block or any server or location beneath it using
  `log_by_lua 'metrics_graphite:log()';`
]==]

local MetricsGraphite = {}
MetricsGraphite.__index = MetricsGraphite

-- Port used for the Carbon connection when init() is not given one.
local DEFAULT_CARBON_PORT = 2003

-- Counters that exist independently of the status/method metadata tables.
local BASE_COUNTERS = {
  "main_loop_worker",
  "requests",
  "request_length",
  "bytes_sent",
  "request_time_sum",
  "request_time_num",
}

--- Read a counter from the shared dictionary, treating a missing key as 0.
-- @param stats shared dictionary holding the counters
-- @tparam string key counter name
-- @treturn number current value, or 0 when the key is absent
local function read_counter(stats, key)
  return stats:get(key) or 0
end

--- Consume the accumulated request-time statistics and return their average.
-- The values that were read are subtracted again (instead of overwriting the
-- counters with 0) so that increments made by log() between the read and the
-- reset are kept for the next interval.
-- @param stats shared dictionary holding the counters
-- @treturn ?number average request time in seconds, or nil when no request
--   was recorded (avoids submitting the 0/0 "nan" value)
local function take_average_request_time(stats)
  local sum = read_counter(stats, "request_time_sum")
  local num = read_counter(stats, "request_time_num")
  if num <= 0 then
    return nil
  end
  stats:incr("request_time_sum", -sum)
  stats:incr("request_time_num", -num)
  return sum / num
end

--- Render all metrics as one newline-separated payload, one line per metric
-- in the form "<path> <value> <timestamp>".
-- @param self MetricsGraphite instance
-- @treturn string payload ready to be written to the carbon socket
local function build_payload(self)
  local stats = self.stats
  local now = ngx.time() -- one timestamp for the whole batch
  local prefix = self.mbase .. ".nginx_test."
  local lines = {}

  local function add(name, value)
    lines[#lines + 1] = prefix .. name .. " " .. value .. " " .. now .. "\n"
  end

  local avg_request_time = take_average_request_time(stats)

  add("test" .. ngx.worker.pid(), 1)
  add("num_requests", read_counter(stats, "requests"))
  add("acc_request_length", read_counter(stats, "request_length"))
  add("acc_bytes_sent", read_counter(stats, "bytes_sent"))
  if avg_request_time then
    add("avg_request_time", avg_request_time)
  end

  for key in pairs(self.query_status) do
    add("num_" .. key, read_counter(stats, key))
  end
  for key in pairs(self.query_method) do
    add("num_" .. key, read_counter(stats, key))
  end

  return table.concat(lines)
end

--- Connect to the carbon host and submit the current metrics once.
-- Failures are logged; nothing is raised to the caller.
-- @param self MetricsGraphite instance
local function submit(self)
  local sock, err = ngx.socket.tcp()
  if not sock then
    ngx.log(ngx.ERR, "nginx-metrics-graphite callback failed to create carbon socket: ", err)
    return
  end

  local ok
  ok, err = sock:connect(self.carbon_host, self.carbon_port)
  if not ok then
    ngx.log(ngx.ERR, "nginx-metrics-graphite callback failed to connect carbon socket: ", err)
    return
  end

  -- a single send instead of one per metric; the result is checked so that
  -- lost submissions show up in the error log
  local bytes
  bytes, err = sock:send(build_payload(self))
  if not bytes then
    ngx.log(ngx.ERR, "nginx-metrics-graphite callback failed to send metrics: ", err)
  end

  sock:close()
end

--- Create the metrics object and reset all counters.
-- Reads `ngx.shared.metrics_graphite`, so a `lua_shared_dict metrics_graphite`
-- directive must be configured.
-- @tparam string carbon_host host name or address of the carbon daemon
-- @tparam number interval seconds between two metric submissions
-- @tparam string mbase metric path prefix
-- @tparam[opt=2003] number carbon_port TCP port of the carbon daemon
-- @treturn MetricsGraphite the initialised instance
function MetricsGraphite.init(carbon_host, interval, mbase, carbon_port)
  local self = setmetatable({}, MetricsGraphite)
  ngx.log(ngx.INFO, "nginx-metrics-graphite initializing on nginx version " .. ngx.config.nginx_version .. " with ngx_lua version " .. ngx.config.ngx_lua_version)
  self.carbon_host = carbon_host
  self.carbon_port = carbon_port or DEFAULT_CARBON_PORT
  self.interval = interval
  self.mbase = mbase

  -- metadata tables for more flexible metric creation
  self.query_status = {
    status_5xx = 500,
    status_4xx = 400,
    status_3xx = 300,
    status_2xx = 200,
    status_1xx = 100,
  }
  self.query_method = {
    method_get = "GET",
    method_head = "HEAD",
    method_put = "PUT",
    method_post = "POST",
    method_delete = "DELETE",
    method_options = "OPTIONS",
    method_other = "",
  }

  -- reverse lookups so log() needs no loop per request
  self.status_keys = {}
  for key, base in pairs(self.query_status) do
    self.status_keys[base / 100] = key
  end
  self.method_keys = {}
  for key, method in pairs(self.query_method) do
    if method ~= "" then
      self.method_keys[method] = key
    end
  end

  -- initialize/reset counters
  local stats = ngx.shared.metrics_graphite
  if not stats then
    error('nginx-metrics-graphite requires "lua_shared_dict metrics_graphite <size>;" in the http block', 2)
  end
  self.stats = stats

  for _, name in ipairs(BASE_COUNTERS) do
    stats:set(name, 0)
  end
  for key in pairs(self.query_status) do
    stats:set(key, 0)
  end
  for key in pairs(self.query_method) do
    stats:set(key, 0)
  end

  return self
end

--- Start the periodic submission loop in exactly one worker.
-- The first worker that raises "main_loop_worker" to 1 owns the loop; every
-- other caller returns immediately. This relies on the shared-dictionary
-- incr being atomic across workers.
-- NOTE(review): the counter is only reset by init(), so a replacement for a
-- crashed loop worker would presumably not take over -- confirm.
function MetricsGraphite:worker()
  local elected, err = self.stats:incr("main_loop_worker", 1)
  if not elected then
    ngx.log(ngx.ERR, "nginx-metrics-graphite failed to elect main loop worker: ", err)
    return
  end
  if elected ~= 1 then
    return
  end

  ngx.log(ngx.INFO, "nginx-metrics-graphite main loop worker PID is " .. ngx.worker.pid())

  local function callback(premature)
    -- first create the new timer to keep our intervals as good as possible
    -- (not when called premature since nginx is going to shut down soon)
    if not premature then
      local ok, timer_err = ngx.timer.at(self.interval, callback)
      if not ok then
        ngx.log(ngx.ERR, "nginx-metrics-graphite callback failed to create interval timer: ", timer_err)
        return
      end
    end

    -- then do the work which might incur delays
    submit(self)
  end

  -- start first timer
  local ok, timer_err = ngx.timer.at(self.interval, callback)
  if not ok then
    ngx.log(ngx.ERR, "nginx-metrics-graphite callback failed to create interval timer: ", timer_err)
    return
  end
end

--- Record the current request in the shared counters.
-- Called on every instrumented request, so it only does table lookups and
-- counter increments.
function MetricsGraphite:log()
  local stats = self.stats
  stats:incr("requests", 1)

  -- status class: 404 -> 4 -> "status_4xx"; codes outside 1xx-5xx are not counted
  local status = tonumber(ngx.status) or 0
  local status_key = self.status_keys[math.floor(status / 100)]
  if status_key then
    stats:incr(status_key, 1)
  end

  local method = ngx.req.get_method()
  stats:incr(self.method_keys[method] or "method_other", 1)

  -- nginx variables arrive as strings (or nil); convert explicitly
  stats:incr("request_length", tonumber(ngx.var.request_length) or 0) -- in bytes
  stats:incr("bytes_sent", tonumber(ngx.var.bytes_sent) or 0) -- in bytes

  local request_time = ngx.now() - ngx.req.start_time() -- in seconds
  stats:incr("request_time_sum", request_time)
  stats:incr("request_time_num", 1)
end

return MetricsGraphite