In [ ]:
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt 
%matplotlib inline
plt.style.use('ggplot')

import dateutil.parser
from dateutil.parser import parse
from datetime import timedelta

import matplotlib
# Font type 42 makes the PDF/PS backends embed TrueType fonts (per matplotlib's
# rcParams documentation), which keeps text in exported figures editable.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
In [ ]:
import re
In [ ]:
import datetime

Data Cleaning

In [ ]:
# Separator between the leading columns of a log line: a run of non-word
# characters that ends in a space, optionally followed by "]".
LINE_SPLIT_RE = re.compile(r'\W+ ]?')

# key=value fields read from the last chunk of each line, in output order
# (MAC, SRC, DPT, PROTO). Group 1 captures the value without its "KEY=" prefix.
KEY_FIELD_RES = (
    re.compile(r'MAC=([\w:]*)'),
    re.compile(r'SRC=([\d.]*)'),
    re.compile(r'DPT=(\d*)'),
    re.compile(r'PROTO=(\w*)'),
)


def parse_log_line(line):
    """Turn one raw log line into an 8-item record.

    The record is ``[date, ms, block, incoming, MAC, SRC, DPT, PROTO]``.
    The first four items are positional slices of the split line. The last
    four are looked up by key in the final chunk and are ``None`` when the key
    is absent, so a line without e.g. ``DPT=`` no longer aborts the whole load.

    Args:
        line: One line of text as read from the log file.

    Returns:
        list: The eight extracted values, in the order given above.

    Raises:
        ValueError: If the line splits into fewer than five chunks.
    """
    results = LINE_SPLIT_RE.split(line)
    if len(results) < 5:
        raise ValueError("unrecognised log line: {!r}".format(line))

    record = [
        # Month chunk plus the second chunk minus its last 14 characters
        # (presumably the host name and "kernel" tag; TODO confirm against the log).
        results[0] + " " + results[1][0:-14],
        results[2][1:],  # drop the first character (presumably "[")
        results[3][1:],  # drop the first character (presumably "[")
        results[4],
    ]

    key_results = results[-1]
    for field_re in KEY_FIELD_RES:
        found = field_re.search(key_results)
        record.append(found.group(1) if found else None)
    return record


lines = []
filepath = 'firewall.log'
with open(filepath) as fp:
    for cnt, line in enumerate(fp, start=1):
        # Blank lines (e.g. a trailing newline at end of file) carry no record.
        if not line.strip():
            continue
        try:
            lines.append(parse_log_line(line))
        except ValueError as err:
            # Re-raise with the position so the offending line is easy to find.
            raise ValueError("{}, line {}: {}".format(filepath, cnt, err)) from err

# Report a count instead of echoing every parsed record.
print("Parsed {} lines from {}".format(len(lines), filepath))
In [ ]:
# One row per parsed log line; the column order mirrors the order in which
# the parsing cell appends values to each record.
df = pd.DataFrame(
    data=lines,
    columns=[
        "date", "ms", "ufw", "IN",
        "mac_address", "source_ip", "DPT", "protocol",
    ],
)
In [ ]:
# Show the parsed records to eyeball the extracted columns.
df

The logged times are 5 hours ahead (corrected below by subtracting 5 hours in `clean_date`). The log file also appears to cover only a limited number of days at a time.

Exploratory Analysis

In [ ]:
# Ten most frequent source addresses; value_counts already sorts by count, descending.
df["source_ip"].value_counts().iloc[:10]

The top source IP addresses are located in:

  • Zagreb, Croatia
  • Bucharest, Romania
  • Moscow, Russia
  • Kiev, Ukraine
  • Amsterdam, Netherlands
In [ ]:
# Ten most frequently targeted destination ports (the DPT= field of each line).
df["DPT"].value_counts().iloc[:10]
In [ ]:
# Number of logged lines per protocol, most common first.
df["protocol"].value_counts(ascending=False)
In [ ]:
def clean_date(date, year=2019, hours_ahead=5):
    """Parse a year-less log timestamp and shift it back by a fixed offset.

    Args:
        date: Timestamp text in ``'%b %d %H:%M:%S'`` form,
            e.g. ``"Nov 3 06:27:48"``.
        year: Year to attach, since the text carries none. Defaults to 2019.
        hours_ahead: Number of hours the logged time runs ahead; this many
            hours are subtracted from the parsed value. Defaults to 5.

    Returns:
        datetime.datetime: The parsed, shifted timestamp (no timezone attached).

    Raises:
        ValueError: If ``date`` does not match the expected format.
    """
    stamped = "{} {}".format(year, date)
    dt = datetime.datetime.strptime(stamped, '%Y %b %d %H:%M:%S')
    return dt - datetime.timedelta(hours=hours_ahead)
In [ ]:
# Sanity check: 06:27:48 shifted back five hours should come out as
# 2019-11-03 01:27:48.
clean_date("Nov 3 06:27:48")
In [ ]:
# Convert each raw 'date' string to a shifted datetime, one element at a time.
df['datetime'] = df['date'].map(clean_date)
In [ ]:
# Confirm the new 'datetime' column sits alongside the raw 'date' text.
df
In [ ]:
# Index the frame by timestamp so it can be resampled; drop=False keeps
# 'datetime' available as a regular column too.
df = df.set_index('datetime', drop=False)
In [ ]:
# Check that the frame is now indexed by the 'datetime' values.
df
In [ ]:
# Logged lines per hour, drawn on a single wide axes and saved as a
# transparent PNG.
fig, ax = plt.subplots(figsize=(15, 5))
hits_per_hour = df['ms'].resample('H').count()
hits_per_hour.plot(ax=ax, legend=False)
fig.savefig("hourly_hits.png", transparent=True)
plt.show()
In [ ]:
# Hourly line counts split by protocol. Each line is labelled and a legend is
# drawn so the green/red traces can be told apart in the saved figure.
only_udp = df[df['protocol'] == 'UDP']
only_tcp = df[df['protocol'] == 'TCP']

fig, ax = plt.subplots(figsize=(15, 5))
only_udp['ms'].resample('H').count().plot(ax=ax, color='green', label='UDP')
only_tcp['ms'].resample('H').count().plot(ax=ax, color='red', label='TCP')
ax.legend()

fig.savefig("hourly_hits_by_protocol.png", transparent=True)
plt.show()
In [ ]: