Skip to content

Instantly share code, notes, and snippets.

@lokal-profil
Last active August 22, 2018 22:43
Show Gist options
  • Save lokal-profil/ce3625346a84474b842715084b64175c to your computer and use it in GitHub Desktop.
Save lokal-profil/ce3625346a84474b842715084b64175c to your computer and use it in GitHub Desktop.
Additional cleanup step for analysing WLM logs.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import re
# Exact log lines that are considered noise. A line is dropped when its text,
# with trailing whitespace removed, equals one of these entries (leading
# whitespace is significant, e.g. ' InsecurePlatformWarning').
_BAD_WARNINGS = frozenset((
    'WARNING: API warning (revisions): Because "rvslots" was not specified, a legacy format has been used for the output. This format is deprecated, and in the future the new format will always be used.',
    'WARNING: API warning (main): Subscribe to the mediawiki-api-announce mailing list at <https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce> for notice of API deprecations and breaking changes. Use [[Special:ApiFeatureUsage]] to see usage of deprecated features by your application.',
    'WARNING: /mnt/nfs/labstore-secondary-tools-project/heritage/.venv/local/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:132: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. You can upgrade to a newer version of Python to solve this. For more information, see https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings',
    ' InsecurePlatformWarning',
    'WARNING: /mnt/nfs/labstore-secondary-tools-project/heritage/.venv/local/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:334: SNIMissingWarning: An HTTPS request has been made, but the SNI (Subject Name Indication) extension to TLS is not available on this platform. This may cause the server to present an incorrect TLS certificate, which can cause validation failures. You can upgrade to a newer version of Python to solve this. For more information, see https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings',
    ' SNIMissingWarning',
    "WARNING: /data/project/heritage/pywikibot/pywikibot/data/api.py:310: UserWarning: Unexpected overlap between action and query submodules: frozenset([u'globalpreferences', u'readinglists'])",
))

# Regular expressions for routine log lines that should be dropped. They are
# compiled once at import time instead of being re-resolved for every line.
# NOTE: every pattern ends in '\n', so a final line that lacks a trailing
# newline is never matched by these patterns and is kept.
_BAD_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'Page \[\[([^\]]*)\]\] saved\n',
    r'WARNING: ([0-9]*) primkey\(s\) missing on ([^\n]*)\n',
    r'Retrieving ([0-9]*) pages from ([^\.]*).\n',
))


def clean_log(filename):
    """Write a copy of a log file with known noise lines removed.

    The input is read as UTF-8 and every line that is not filtered out is
    written unchanged to ``<filename>.clean`` (also UTF-8). A line is
    filtered out when it is

    * empty or whitespace-only,
    * equal (ignoring trailing whitespace) to an entry in ``_BAD_WARNINGS``, or
    * matched by one of the expressions in ``_BAD_PATTERNS``.

    The checks are applied in that order and each removed line is counted in
    exactly one category. A one-line summary of the three counts is printed
    to standard output once the whole file has been processed.

    :param filename: path of the log file to clean.
    :return: None
    """
    import codecs

    counter_empty = 0
    counter_warning = 0
    counter_pattern = 0

    out_name = '{}.clean'.format(filename)
    # Context managers guarantee both handles are closed, also on errors.
    with codecs.open(filename, 'r', 'utf-8') as f_in, \
            codecs.open(out_name, 'w', 'utf-8') as f_out:
        # Stream the file line by line rather than loading it all at once.
        for line in f_in:
            if not line.strip():
                counter_empty += 1
            elif line.rstrip() in _BAD_WARNINGS:
                counter_warning += 1
            elif any(pat.search(line) for pat in _BAD_PATTERNS):
                counter_pattern += 1
            else:
                f_out.write(line)

    # Single-argument print(...) behaves identically on Python 2.7 and 3.
    print('Removed: empty: {}, warnings: {}, patterns: {}'.format(
        counter_empty, counter_warning, counter_pattern))
def main():
    """Command-line entry point.

    Expects a single positional argument, ``filename``, and hands it to
    ``clean_log``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("filename")
    options = cli.parse_args()
    clean_log(options.filename)
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
@lokal-profil
Copy link
Author

This is written for Python 2.7, since that is the default Python on Toolforge.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment