Skip to content

Instantly share code, notes, and snippets.

@joshuar
Last active August 29, 2015 14:07
Show Gist options
  • Save joshuar/c37f310249d387aef530 to your computer and use it in GitHub Desktop.
Save joshuar/c37f310249d387aef530 to your computer and use it in GitHub Desktop.

Logstash custom patterns for Lustre

# Custom grok patterns for Lustre / LNet log messages.
# Each line is "<NAME> <regex>"; everything after the first space is the regex.

# A word followed by one to three hyphen-prefixed words.
LUSTRE_OBJECT %{WORD}(-%{WORD}){1,3}
# An IP address, "@", then a word (e.g. 10.0.0.1@tcp).
LUSTRE_LNET %{IP}@%{WORD}
# "<file>.c:<line>". The dot is escaped so that only a literal "." is accepted.
LUSTRE_SOURCECODE (%{USERNAME}\.c:%{INT})
# Return code: "rc <n>", "rc = <n>" or "rc <n>/<m>".
# - Whitespace is matched with \s+ and the "=" is optional together with its
#   trailing whitespace, so the single-space form "rc 0/-1" matches as well as
#   "rc = -5".
# - The longer "<n>/<m>" alternative is tried first; otherwise the single-INT
#   alternative would win at the end of a pattern and capture only "<n>".
# - error_code is only captured for the single-number form.
LUSTRE_ERRCODE rc\s+(=\s+)?(%{INT}/%{INT}|%{INT:error_code})
# Prefix form 1: "<facility>: [<word>-<word>: ]<object>:" -- captures lustre_object.
LUSTRE_LOGPREFIX1 (Lustre|LustreError|LNetError): (%{WORD}-%{WORD}: )?%{LUSTRE_OBJECT:lustre_object}:
# Prefix form 2: "<facility>: <word>:<word>:(<file>.c:<line>:<function>())"
# -- captures lustre_source and lustre_function.
LUSTRE_LOGPREFIX2 (Lustre|LustreError|LNet|LNetError):%{SPACE}?%{WORD}:%{WORD}:\(%{LUSTRE_SOURCECODE:lustre_source}:%{USERNAME:lustre_function}\(\)\)
# Prefix form 3: bare "<facility>:" fallback, captures nothing.
LUSTRE_LOGPREFIX3 (Lustre|LustreError|LNet|LNetError):
# Any of the above, most specific first.
LUSTRE_LOGPREFIX (%{LUSTRE_LOGPREFIX1}|%{LUSTRE_LOGPREFIX2}|%{LUSTRE_LOGPREFIX3})

Logstash grok pattern matching for Lustre

       # Parse Lustre / LNet / LDISKFS messages out of the "message" field.
       # Custom LUSTRE_* patterns are loaded from patterns_dir. Events that
       # match any pattern are tagged "lustre".
       #
       # Patterns are tried in order and the first match wins, so specific
       # patterns must come before the generic catch-all.
       grok {
         patterns_dir => "/etc/logstash/patterns"
         match   => [ "message", "%{LUSTRE_LOGPREFIX} not available for connect from %{LUSTRE_LNET:lustre_host} \(no target\)",
                      "message", "%{LUSTRE_LOGPREFIX} This client was evicted by %{LUSTRE_OBJECT}; in progress operations using this service will fail\.",
                      "message", "%{LUSTRE_LOGPREFIX} %{LUSTRE_OBJECT:lustre_object}: %{USERNAME} failed for resource %{DATA:lustre_resource}: %{LUSTRE_ERRCODE}",
                      "message", "%{LUSTRE_LOGPREFIX} %{LUSTRE_OBJECT:lustre_object}: error destroying precreated id %{DATA}:%{DATA}: %{LUSTRE_ERRCODE}",
                      "message", "%{LUSTRE_LOGPREFIX} Bulk IO write error with %{UUID:lustre_uuid} \(at %{LUSTRE_LNET:lustre_host}\), client will retry: %{LUSTRE_ERRCODE}",
                      "message", "%{LUSTRE_LOGPREFIX} @@@ %{GREEDYDATA}: %{LUSTRE_ERRCODE} %{GREEDYDATA} %{WORD}->%{UUID:lustre_uuid}@%{LUSTRE_LNET:lustre_host}:%{GREEDYDATA}",
                      "message", "%{LUSTRE_LOGPREFIX} Communicating with %{LUSTRE_LNET:lustre_host}, %{GREEDYDATA}",
                      "message", "%{LUSTRE_LOGPREFIX} Client %{UUID:lustre_uuid} \(at %{LUSTRE_LNET:lustre_host}\) reconnecting",
                      "message", "%{LUSTRE_LOGPREFIX} Client %{UUID:lustre_uuid} \(at %{LUSTRE_LNET:lustre_host}\) refused reconnection, %{GREEDYDATA}",
                      "message", "%{LUSTRE_LOGPREFIX} Connection restored to %{LUSTRE_OBJECT} \(at %{LUSTRE_LNET:lustre_host}\)",
                      "message", "%{LUSTRE_LOGPREFIX} binary \[%{DATA:lustre_resource}\] changed while waiting for the page fault lock",
                      # Must precede the catch-all below: "Lustre: Mounted ..." also
                      # matches "%{LUSTRE_LOGPREFIX} %{GREEDYDATA}", which would
                      # otherwise win and leave lustre_object unset.
                      "message", "Lustre: Mounted %{LUSTRE_OBJECT:lustre_object}",
                      # Catch-all for any other message carrying a Lustre/LNet prefix.
                      "message", "%{LUSTRE_LOGPREFIX} %{GREEDYDATA}",
                      "message", "LDISKFS-fs \(%{USERNAME:device}\): %{GREEDYDATA}" ]
         add_tag => [ "lustre" ]
       }
       # Refine the generic "lustre" tag and time connection outages.
       if "lustre" in [tags] {
         # Swap the generic tag for a more specific one based on the message
         # prefix. The three prefixes are mutually exclusive, so at most one
         # branch runs.
         if [message] =~ /^LNet:/ {
           mutate {
             remove_tag => [ "lustre" ]
             add_tag => [ "lnet" ]
           }
         } else if [message] =~ /^LNetError:/ {
           mutate {
             remove_tag => [ "lustre" ]
             add_tag => [ "lnet_error" ]
           }
         } else if [message] =~ /^LustreError:/ {
           mutate {
             remove_tag => [ "lustre" ]
             add_tag => [ "lustre_error" ]
           }
         }

         # Set up a timer measuring how long a Lustre connection stays down.
         # Both the start and the end event get the same lustre_conn_id
         # (host + object) so they can be paired.

         # Start event: the connection was lost.
         if [message] =~ /was lost;/ {
           mutate {
             add_tag => [ "lustre_conn_lost" ]
             add_field => [ "lustre_conn_id", "%{syslog_hostname}:%{lustre_object}" ]
           }
         }

         # End event: the connection was restored.
         if [message] =~ /Connection restored/ {
           mutate {
             add_tag => [ "lustre_conn_restored" ]
             add_field => [ "lustre_conn_id", "%{syslog_hostname}:%{lustre_object}" ]
           }
         }

         # Pair start and end events by lustre_conn_id and record the elapsed
         # time between them; the timeout is 43200 seconds (12 hours).
         elapsed {
           unique_id_field => "lustre_conn_id"
           start_tag => "lustre_conn_lost"
           end_tag => "lustre_conn_restored"
           timeout => 43200
         }
       }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment