Skip to content

Instantly share code, notes, and snippets.

@leisurelicht
Last active June 3, 2019 15:53
Show Gist options
  • Save leisurelicht/d7d0005abdf8b743f90bc99ba35ac0d2 to your computer and use it in GitHub Desktop.
Save leisurelicht/d7d0005abdf8b743f90bc99ba35ac0d2 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
"""
面试题
Q:有一个大文件日志,日志内容包含 访问时间 和 访问 IP,问如何统计每分钟访问次数超过 100 次的 IP ?
访问日志文件内的数据都是根据时间有序排列的。所以只要逐行处理,把秒去掉,然后利用字典统计每分钟内的IP访问次数。
到下一分钟了就把字典清空,重新统计就可以了。
"""
import random
from datetime import datetime, timedelta
def genlog(path="sample.log", max_hits_per_second=150):
    """Write a random access log for one hour of 2019-05-19 to *path*.

    A pool of 50 random dotted-quad addresses is generated first. Then, for
    every second of the hour 00:00:00-00:59:59, between 0 and
    *max_hits_per_second* lines (inclusive) are written, each of the form
    ``"YYYY-MM-DD HH:MM:SS<TAB>ip<NEWLINE>"`` with the address drawn at
    random from the pool.

    Lines are emitted from 00:59:59 down to 00:00:00, so the file is in
    descending time order; all lines sharing a minute are contiguous.

    Args:
        path: File to create or overwrite. Defaults to ``"sample.log"``.
        max_hits_per_second: Upper bound (inclusive) on the number of lines
            written for any single second. Defaults to 150.

    Raises:
        ValueError: If *max_hits_per_second* is negative (raised by
            ``random.randint``).
        OSError: If *path* cannot be opened for writing.
    """
    ips = [
        ".".join(str(random.randint(0, 255)) for _ in range(4))
        for _ in range(50)
    ]
    # The context manager guarantees the file is flushed and closed even if
    # writing fails part-way through.
    with open(path, "w") as f:
        for minute in range(59, -1, -1):
            for second in range(59, -1, -1):
                # The timestamp only changes once per second, so format it
                # once here instead of once per written line.
                stamp = datetime(2019, 5, 19, 0, minute, second).strftime(
                    "%Y-%m-%d %H:%M:%S"
                )
                for _ in range(random.randint(0, max_hits_per_second)):
                    f.write("{}\t{}\n".format(stamp, random.choice(ips)))
def _over_threshold(counts, threshold):
    """Return the set of keys in *counts* whose value is greater than *threshold*."""
    return {addr for addr, hits in counts.items() if hits > threshold}


def main(path="sample.log", threshold=100):
    """Find IPs that were seen more than *threshold* times within one minute.

    The log at *path* is read line by line; each non-blank line must be
    ``"YYYY-MM-DD HH:MM:SS<TAB>ip"``. Lines belonging to the same minute are
    expected to be contiguous, so only one minute's counters are kept in
    memory at a time: when the minute changes, the finished minute is
    evaluated and the counters are reset.

    Args:
        path: Log file to read. Defaults to ``"sample.log"``.
        threshold: An IP is reported when its count in a single minute is
            strictly greater than this value. Defaults to 100.

    Returns:
        set: Every IP string that exceeded *threshold* in at least one minute.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
        OSError: If *path* cannot be opened for reading.
    """
    current = None
    counts = {}
    exceed = set()
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            ds, ip = line.split("\t")
            minute = ds[:16]  # "YYYY-MM-DD HH:MM": drop the seconds
            if minute != current:
                # A new minute has started: evaluate the finished one. This
                # must not rebind `ip`, which still names the address of the
                # line being processed.
                exceed |= _over_threshold(counts, threshold)
                current = minute
                counts = {}
            counts[ip] = counts.get(ip, 0) + 1
    # The final minute is never followed by a minute change, so it has to be
    # evaluated explicitly once the file is exhausted.
    exceed |= _over_threshold(counts, threshold)
    return exceed
if __name__ == "__main__":
    # Script entry point: regenerate the sample log, then report the IPs
    # returned by main().
    genlog()
    busy_ips = main()
    print(busy_ips)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment