From c97adf0dc4c3e07f44cfc9d7e4ee2a6b852cee37 Mon Sep 17 00:00:00 2001
From: JuanLeon Lahoz <juanleon.lahoz@gmail.com>
Date: Wed, 18 Aug 2021 08:23:06 +0200
Subject: [PATCH] feature: Implement the common-term plot

---
 .github/workflows/test.yml |  4 +-
 Cargo.lock                 |  4 +-
 Cargo.toml                 |  2 +-
 README.md                  | 47 +++++++++++++-------
 src/app.rs                 | 28 ++++++++++--
 src/main.rs                | 40 +++++++++++++++--
 src/plot/histogram.rs      |  2 +-
 src/plot/mod.rs            |  2 +
 src/plot/terms.rs          | 90 ++++++++++++++++++++++++++++++++++++++
 src/read/buckets.rs        | 53 ++++++++++++++++++++--
 src/read/dateparser.rs     |  6 +--
 src/read/times.rs          |  2 +-
 12 files changed, 244 insertions(+), 36 deletions(-)
 create mode 100644 src/plot/terms.rs

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 15164f4..b01948b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,5 +21,7 @@ jobs:
       run: cargo test -- --test-threads=1
     - name: Check format
       run: cargo fmt -- --check
+    - name: Get clippy version
+      run: cargo clippy -V
     - name: Run clippy
-      run: cargo clippy -- -D clippy
+      run: cargo clippy -- -D clippy::all
diff --git a/Cargo.lock b/Cargo.lock
index af06c9c..4678e14 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,5 +1,7 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
+version = 3
+
 [[package]]
 name = "aho-corasick"
 version = "0.7.15"
@@ -294,7 +296,7 @@ dependencies = [
 
 [[package]]
 name = "lowcharts"
-version = "0.4.1"
+version = "0.4.2"
 dependencies = [
  "assert_cmd",
  "atty",
diff --git a/Cargo.toml b/Cargo.toml
index d4dadd7..62307ba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lowcharts"
-version = "0.4.1"
+version = "0.4.2"
 authors = ["JuanLeon Lahoz <juanleon.lahoz@gmail.com>"]
 edition = "2018"
 description = "Tool to draw low-resolution graphs in terminal"
diff --git a/README.md b/README.md
index dd64c18..121e42e 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ terminal.
 Type `lowcharts --help`, or `lowcharts PLOT-TYPE --help` for a complete list of
 options.
 
-Currently five basic types of plots are supported:
+Currently six basic types of plots are supported:
 
 #### Bar chart for matches in the input
 
@@ -33,7 +33,7 @@ This chart is generated using `lowcharts matches database.log SELECT UPDATE DELE
 
 [![Simple bar chart with lowcharts](resources/matches-example.png)](resources/matches-example.png)
 
-#### Histogram
+#### Histogram for numerical inputs
 
 This chart is generated using `python3 -c 'import random; [print(random.normalvariate(5, 5)) for _ in range(100000)]' | lowcharts hist`:
 
@@ -85,19 +85,6 @@ each ∎ represents a count of 228
 [0.044 .. 0.049] [  183]
 ```
 
-#### X-Y Plot
-
-This chart is generated using  `cat ram-usage | lowcharts plot --height 20 --width 50`:
-
-[![Sample plot with lowcharts](resources/plot-example.png)](resources/plot-example.png)
-
-Note that x axis is not labelled.  The tool splits the input data by chunks of a
-fixed size and then the chart display the averages of those chunks.  In other
-words: grouping data by time is not (yet?) supported; you can see the evolution
-of a metric over time, but not the speed of that evolution.
-
-There is regex support for this type of plots.
-
 #### Time Histogram
 
 This chart is generated using  `strace -tt ls -lR * 2>&1 | lowcharts timehist --intervals 10`:
@@ -109,7 +96,7 @@ similar way, and would give you a glimpse of when and how many 404s are being
 triggered in your server.
 
 The idea is to depict the frequency of logs that match a regex (by default any
-log that is read by the tool).  The sub-command can autodetect the more common
+log that is read by the tool).  The sub-command can autodetect the most common
 (in my personal and biased experience) datetime/timestamp formats: rfc 3339, rfc
 2822, python `%(asctime)s`, golang default log format, nginx, rabbitmq, strace
 -t (or -tt, or -ttt),ltrace,... as long as the timestamp is present in the first
@@ -130,12 +117,38 @@ timezones).
 
 This adds up the time histogram and bar chart in a single visualization.
 
-This chart is generated using  `strace -tt ls -lR 2>&1 | lowcharts split-timehist open mmap close read write --intervals 10`:
+This chart is generated using `strace -tt ls -lR 2>&1 | lowcharts split-timehist open mmap close read write --intervals 10`:
 
 [![Sample plot with lowcharts](resources/split-timehist-example.png)](resources/split-timehist-example.png)
 
 This graph depicts the relative frequency of search terms in time.
 
+#### Common terms histogram
+
+Useful for plotting the most common terms in input lines.
+
+This sample chart is generated using `strace ls -l 2>&1 | lowcharts common-terms --lines 8 -R '(.*?)\('`:
+
+[![Sample plot with lowcharts](resources/common-terms-example.png)](resources/common-terms-example.png)
+
+The graph depicts the 8 syscalls most used by the `ls -l` command, sorted by
+frequency, along with their number of uses.  In general, `lowcharts common-terms`
+is a handy substitute for commands of the form `awk ... | sort | uniq -c |
+sort -rn | head`.
+
+#### X-Y Plot
+
+This chart is generated using `cat ram-usage | lowcharts plot --height 20 --width 50`:
+
+[![Sample plot with lowcharts](resources/plot-example.png)](resources/plot-example.png)
+
+Note that the x axis is not labelled.  The tool splits the input data into chunks
+of a fixed size and then the chart displays the averages of those chunks.  In
+other words: grouping data by time is not (yet?) supported; you can see the
+evolution of a metric over time, but not the speed of that evolution.
+
+There is regex support for this type of plot.
+
 ### Installing
 
 #### Via release
diff --git a/src/app.rs b/src/app.rs
index d5fac64..87f7a40 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -46,8 +46,7 @@ lines.
 By default this will use a capture group named `value`.  If not present, it will
 use first capture group.
 
-If no regex is used, a number per line is expected (something that can be parsed
-as float).
+If no regex is used, the whole input lines will be matched.
 
 Examples of regex are ' 200 \\d+ ([0-9.]+)' (where there is one anonymous capture
 group) and 'a(a)? (?P<value>[0-9.]+)' (where there are two capture groups, and
@@ -68,7 +67,7 @@ fn add_non_capturing_regex(app: App) -> App {
         Arg::new("regex")
             .long("regex")
             .short('R')
-            .about("Filter out lines where regex is notr present")
+            .about("Filter out lines where regex is not present")
             .takes_value(true),
     )
 }
@@ -170,6 +169,19 @@ pub fn get_app() -> App<'static> {
             .multiple(true),
     );
 
+    let mut common_terms = App::new("common-terms")
+        .version(clap::crate_version!())
+        .setting(AppSettings::ColoredHelp)
+        .about("Plot histogram with most common terms in input lines");
+    common_terms = add_input(add_regex(add_width(common_terms))).arg(
+        Arg::new("lines")
+            .long("lines")
+            .short('l')
+            .about("Display that many lines, sorting by most frequent")
+            .default_value("10")
+            .takes_value(true),
+    );
+
     App::new("lowcharts")
         .author(clap::crate_authors!())
         .version(clap::crate_version!())
@@ -198,6 +210,7 @@ pub fn get_app() -> App<'static> {
         .subcommand(matches)
         .subcommand(timehist)
         .subcommand(splittimehist)
+        .subcommand(common_terms)
 }
 
 #[cfg(test)]
@@ -279,4 +292,13 @@ mod tests {
             sub_m.values_of("match").unwrap().collect::<Vec<&str>>()
         );
     }
+
+    #[test]
+    fn terms_subcommand_arg_parsing() {
+        let arg_vec = vec!["lowcharts", "common-terms", "--regex", "foo", "some"];
+        let m = get_app().get_matches_from(arg_vec);
+        let sub_m = m.subcommand_matches("common-terms").unwrap();
+        assert_eq!("some", sub_m.value_of("input").unwrap()); // trailing positional is the input
+        assert_eq!("foo", sub_m.value_of("regex").unwrap());
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 732007d..267a389 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -85,7 +85,7 @@ fn get_float_reader(matches: &ArgMatches) -> Result<read::DataReader, ()> {
         builder.range(min..max);
     }
     if let Some(string) = matches.value_of("regex") {
-        match Regex::new(&string) {
+        match Regex::new(string) {
             Ok(re) => {
                 builder.regex(re);
             }
@@ -100,7 +100,7 @@ fn get_float_reader(matches: &ArgMatches) -> Result<read::DataReader, ()> {
 
 /// Implements the hist cli-subcommand
 fn histogram(matches: &ArgMatches) -> i32 {
-    let reader = match get_float_reader(&matches) {
+    let reader = match get_float_reader(matches) {
         Ok(r) => r,
         _ => return 2,
     };
@@ -122,7 +122,7 @@ fn histogram(matches: &ArgMatches) -> i32 {
 
 /// Implements the plot cli-subcommand
 fn plot(matches: &ArgMatches) -> i32 {
-    let reader = match get_float_reader(&matches) {
+    let reader = match get_float_reader(matches) {
         Ok(r) => r,
         _ => return 2,
     };
@@ -155,11 +155,42 @@ fn matchbar(matches: &ArgMatches) -> i32 {
     0
 }
 
+/// Implements the common-terms cli-subcommand
+///
+/// Returns 0 on success, 1 when the regex is invalid and 2 when `lines` is below 1.
+fn common_terms(matches: &ArgMatches) -> i32 {
+    // Without a user-supplied regex, each whole input line is one term.
+    let pattern = matches.value_of("regex").unwrap_or("(.*)");
+    let mut builder = read::DataReaderBuilder::default();
+    match Regex::new(pattern) {
+        Ok(re) => {
+            builder.regex(re);
+        }
+        Err(_) => {
+            error!("Failed to parse regex {}", pattern);
+            return 1;
+        }
+    };
+    let reader = builder.build().unwrap();
+    let width = matches.value_of_t("width").unwrap();
+    let lines = matches.value_of_t("lines").unwrap();
+    if lines < 1 {
+        error!("You should specify a positive number of lines");
+        return 2;
+    }
+    print!(
+        "{:width$}",
+        reader.read_terms(matches.value_of("input").unwrap(), lines),
+        width = width
+    );
+    0
+}
+
 /// Implements the timehist cli-subcommand
 fn timehist(matches: &ArgMatches) -> i32 {
     let mut builder = read::TimeReaderBuilder::default();
     if let Some(string) = matches.value_of("regex") {
-        match Regex::new(&string) {
+        match Regex::new(string) {
             Ok(re) => {
                 builder.regex(re);
             }
@@ -236,6 +267,7 @@ fn main() {
         Some(("plot", subcommand_matches)) => plot(subcommand_matches),
         Some(("matches", subcommand_matches)) => matchbar(subcommand_matches),
         Some(("timehist", subcommand_matches)) => timehist(subcommand_matches),
+        Some(("common-terms", subcommand_matches)) => common_terms(subcommand_matches),
         Some(("split-timehist", subcommand_matches)) => splittime(subcommand_matches),
         _ => unreachable!("Invalid subcommand"),
     });
diff --git a/src/plot/histogram.rs b/src/plot/histogram.rs
index ba424a9..23905fd 100644
--- a/src/plot/histogram.rs
+++ b/src/plot/histogram.rs
@@ -77,7 +77,7 @@ impl fmt::Display for Histogram {
         let writer = HistWriter {
             width: f.width().unwrap_or(110),
         };
-        writer.write(f, &self)
+        writer.write(f, self)
     }
 }
 
diff --git a/src/plot/mod.rs b/src/plot/mod.rs
index 1850e7f..a904e85 100644
--- a/src/plot/mod.rs
+++ b/src/plot/mod.rs
@@ -1,12 +1,14 @@
 pub use self::histogram::Histogram;
 pub use self::matchbar::{MatchBar, MatchBarRow};
 pub use self::splittimehist::SplitTimeHistogram;
+pub use self::terms::CommonTerms;
 pub use self::timehist::TimeHistogram;
 pub use self::xy::XyPlot;
 
 mod histogram;
 mod matchbar;
 mod splittimehist;
+mod terms;
 mod timehist;
 mod xy;
 
diff --git a/src/plot/terms.rs b/src/plot/terms.rs
new file mode 100644
index 0000000..2032cc4
--- /dev/null
+++ b/src/plot/terms.rs
@@ -0,0 +1,90 @@
+use std::collections::HashMap;
+use std::fmt;
+
+use yansi::Color::{Blue, Green, Red};
+
+/// Occurrence counts per term, rendered through `Display` as a bar chart.
+#[derive(Debug)]
+pub struct CommonTerms {
+    pub terms: HashMap<String, usize>, // term -> number of times it was observed
+    lines: usize, // maximum number of terms shown when displayed
+}
+
+impl CommonTerms {
+    /// Creates an empty counter that displays at most `lines` terms.
+    pub fn new(lines: usize) -> Self {
+        Self { terms: HashMap::new(), lines }
+    }
+
+    /// Records one more occurrence of `term`.
+    pub fn observe(&mut self, term: String) {
+        *self.terms.entry(term).or_default() += 1;
+    }
+}
+
+impl fmt::Display for CommonTerms {
+    /// Writes the `lines` most frequent terms as bars scaled by the formatter width.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let width = f.width().unwrap_or(100).max(1); // a zero width would divide by zero below
+        let mut counts: Vec<(&String, &usize)> = self.terms.iter().collect();
+        if counts.is_empty() {
+            return writeln!(f, "No data");
+        }
+        counts.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0))); // ties: by term
+        let values = &counts[..self.lines.min(counts.len())];
+        let label_width = values.iter().fold(1, |acc, x| acc.max(x.0.chars().count()));
+        let divisor = 1.max(counts[0].1 / width);
+        let width_count = format!("{}", counts[0].1).len();
+        writeln!(
+            f,
+            "Each {} represents a count of {}",
+            Red.paint("∎"),
+            Blue.paint(divisor.to_string()),
+        )?;
+        for (term, count) in values.iter() {
+            writeln!(
+                f,
+                "[{label}] [{count}] {bar}",
+                label = Blue.paint(format!("{:>width$}", term, width = label_width)),
+                count = Green.paint(format!("{:width$}", count, width = width_count)),
+                bar = Red.paint(format!("{:∎<width$}", "", width = *count / divisor))
+            )?;
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use yansi::Paint;
+
+    #[test]
+    fn test_common_terms_empty() {
+        let terms = CommonTerms::new(10);
+        Paint::disable(); // turn yansi styling off so plain strings can be compared
+        let display = format!("{}", terms);
+        assert_eq!(display, "No data\n");
+    }
+
+    #[test]
+    fn test_common_terms() {
+        let mut terms = CommonTerms::new(2); // at most two terms are displayed
+        for _ in 0..100 {
+            terms.observe(String::from("foo"));
+        }
+        for _ in 0..10 {
+            terms.observe(String::from("arrrrrrrr"));
+        }
+        for _ in 0..20 {
+            terms.observe(String::from("barbar"));
+        }
+        Paint::disable(); // turn yansi styling off so plain strings can be compared
+        let display = format!("{:10}", terms); // width 10 with a top count of 100: divisor 10
+
+        println!("{}", display);
+        assert!(display.contains("[   foo] [100] ∎∎∎∎∎∎∎∎∎∎\n"));
+        assert!(display.contains("[barbar] [ 20] ∎∎\n"));
+        assert!(!display.contains("arr")); // the third-ranked term is cut off
+    }
+}
diff --git a/src/read/buckets.rs b/src/read/buckets.rs
index cbbbc0f..51adc99 100644
--- a/src/read/buckets.rs
+++ b/src/read/buckets.rs
@@ -3,7 +3,7 @@ use std::ops::Range;
 
 use regex::Regex;
 
-use crate::plot::{MatchBar, MatchBarRow};
+use crate::plot::{CommonTerms, MatchBar, MatchBarRow};
 use crate::read::open_file;
 
 #[derive(Debug, Default, Builder)]
@@ -24,7 +24,7 @@ impl DataReader {
         for line in open_file(path).lines() {
             match line {
                 Ok(as_string) => {
-                    if let Some(n) = line_parser(&self, &as_string) {
+                    if let Some(n) = line_parser(self, &as_string) {
                         match &self.range {
                             Some(range) => {
                                 if range.contains(&n) {
@@ -55,9 +55,9 @@ impl DataReader {
         match self.regex.as_ref().unwrap().captures(line) {
             Some(cap) => {
                 if let Some(name) = cap.name("value") {
-                    self.parse_float(&name.as_str())
+                    self.parse_float(name.as_str())
                 } else if let Some(capture) = cap.get(1) {
-                    self.parse_float(&capture.as_str())
+                    self.parse_float(capture.as_str())
                 } else {
                     None
                 }
@@ -86,6 +86,26 @@ impl DataReader {
         }
         MatchBar::new(rows)
     }
+
+    /// Counts the regex captures found in the lines of `path`: the `value` named group if
+    /// it matched, else the first group. Read errors are logged; panics if no regex is set.
+    pub fn read_terms(&self, path: &str, lines: usize) -> CommonTerms {
+        let pattern = self.regex.as_ref().unwrap();
+        let mut counter = CommonTerms::new(lines);
+        for line in open_file(path).lines() {
+            match line {
+                Err(error) => error!("{}", error),
+                Ok(text) => {
+                    let caps = pattern.captures(&text);
+                    let hit = caps.and_then(|c| c.name("value").or_else(|| c.get(1)));
+                    if let Some(found) = hit {
+                        counter.observe(String::from(found.as_str()));
+                    }
+                }
+            }
+        }
+        counter
+    }
 }
 
 #[cfg(test)]
@@ -180,4 +200,29 @@ mod tests {
         assert_eq!(mb.vec[2].label, "data");
         assert_eq!(mb.vec[2].count, 2);
     }
+
+    #[test]
+    fn basic_term_reader() {
+        let re = Regex::new("^foo ([0-9.-]+) (?P<value>[0-9.-]+)").unwrap();
+        let reader = DataReaderBuilder::default().regex(re).build().unwrap();
+        let mut file = NamedTempFile::new().unwrap();
+        writeln!(file, "foo 1.1 1.6").unwrap();
+        writeln!(file, "foo 1.2 1.5").unwrap();
+        writeln!(file, "foo 1.3 1.6").unwrap();
+        writeln!(file, "foo 1.4 1.7").unwrap();
+        let ct = reader.read_terms(file.path().to_str().unwrap(), 10);
+        assert_eq!(ct.terms.len(), 3); // the named `value` group is used: 1.5, 1.6 (twice), 1.7
+        assert_eq!(*ct.terms.get(&String::from("1.5")).unwrap(), 1);
+        assert_eq!(*ct.terms.get(&String::from("1.6")).unwrap(), 2);
+        assert_eq!(*ct.terms.get(&String::from("1.7")).unwrap(), 1);
+        // Now, with no named capture group
+        let re = Regex::new("^foo ([0-9.-]+) ([0-9.-]+)").unwrap();
+        let reader = DataReaderBuilder::default().regex(re).build().unwrap();
+        let ct = reader.read_terms(file.path().to_str().unwrap(), 10);
+        assert_eq!(ct.terms.len(), 4); // the first group is used: 1.1 to 1.4, once each
+        assert_eq!(*ct.terms.get(&String::from("1.1")).unwrap(), 1);
+        assert_eq!(*ct.terms.get(&String::from("1.2")).unwrap(), 1);
+        assert_eq!(*ct.terms.get(&String::from("1.3")).unwrap(), 1);
+        assert_eq!(*ct.terms.get(&String::from("1.4")).unwrap(), 1);
+    }
 }
diff --git a/src/read/dateparser.rs b/src/read/dateparser.rs
index efe94e7..2e61fbd 100644
--- a/src/read/dateparser.rs
+++ b/src/read/dateparser.rs
@@ -30,8 +30,8 @@ pub struct LogDateParser {
 impl LogDateParser {
     pub fn new(log_line: &str, format_string: &Option<String>) -> Result<LogDateParser, String> {
         match format_string {
-            Some(ts_format) => Self::new_with_format(&log_line, &ts_format),
-            None => Self::new_with_guess(&log_line),
+            Some(ts_format) => Self::new_with_format(log_line, ts_format),
+            None => Self::new_with_guess(log_line),
         }
     }
 
@@ -98,7 +98,7 @@ impl LogDateParser {
             return Some(Box::new(DateTime::parse_from_rfc3339));
         } else if DateTime::parse_from_rfc2822(s).is_ok() {
             return Some(Box::new(DateTime::parse_from_rfc2822));
-        } else if Self::looks_like_timestamp(&s) {
+        } else if Self::looks_like_timestamp(s) {
             return Some(Box::new(|string: &str| {
                 let dot = match string.find('.') {
                     Some(x) => x,
diff --git a/src/read/times.rs b/src/read/times.rs
index bc61702..bae0ee9 100644
--- a/src/read/times.rs
+++ b/src/read/times.rs
@@ -82,7 +82,7 @@ impl TimeReader {
             }
         }
         if let Some(re) = &self.regex {
-            if re.is_match(&line) {
+            if re.is_match(line) {
                 vec.push(d);
             }
         } else {