机器学习样本标记 示意代码

目标:根据各个字段数据的分布(例如srcIP和dstIP的top 10)以及其他特征对样本进行人工标注,最终把每个域名的样本标注为black/white/white-like/ddos/mddos/cdn/unknown中的一类。

效果示意:

-------------choose one--------------
sub domain: DNSQueryName(N)
ip: srcip(S) or dstip(D)
length: DNSRequestLength(R1) or DNSReplyLength(R2)
length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)
port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)
code: DNSReplyCode(C2) or DNSRequestRRType(C1)
other: DNSRRClass(RR) or DNSReplyIPv4(V)
-------------label or quit------------
black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)
next(Q) or exit(E)?
***************************************
domain: workgroup. flow count: 206
***************************************
------------srcip-----------------
count                 206
unique                  9
top       162.105.129.122
freq                  150
Name: sourceIP, dtype: object
--------------destip---------------
count             206
unique             12
top       199.7.83.42
freq               82
Name: destIP, dtype: object

 

代码:

import sys
import json
import os
import pandas as pd
import tldextract
# import numpy as np


# Metadata columns this script reads, one "<1-based column number> = <name>"
# entry per line. The numbers are converted to 0-based positions below.
medata_field = '''
3 = sourceIP
4 = destIP
5 = sourcePort
6 = destPort
7 = protocol
12 = flowStartSeconds
13 = flowEndSecond
54 = DNSReplyCode
55 = DNSQueryName
56 = DNSRequestRRType
57 = DNSRRClass
58 = DNSDelay
59 = DNSReplyTTL
60 = DNSReplyIPv4
61 = DNSReplyIPv6
62 = DNSReplyRRType
77 = DNSReplyName
81 = payload
88 = DNSRequestLength
89 = DNSRequestErrLength
90 = DNSReplyLength
91 = DNSReplyErrLength
'''

# Parallel lists: medata_field_num[i] is the 0-based column position of the
# field whose name is medata_field_info[i].
medata_field_num = []
medata_field_info = []
for entry in medata_field.splitlines():
    entry = entry.strip()
    # Skip blank and whitespace-only lines; unpacking them below would fail.
    if not entry:
        continue
    num, info = entry.split(" = ")
    medata_field_num.append(int(num) - 1)
    medata_field_info.append(info)
# Single-argument print() produces the same output on Python 2 and Python 3.
print(medata_field_num)
print(medata_field_info)


def extract_domain(domain):
    """Return the "<domain>.<suffix>" part of a DNS query name.

    The name is split with ``tldextract.extract``. When the extracted
    ``domain`` part is empty, only the suffix is returned. Otherwise the
    domain and suffix are joined with a dot, so an empty suffix yields a
    result that ends with a trailing ".".

    Any exception raised while extracting (for example a non-string input)
    is printed and the literal string "unknown" is returned instead.
    """
    try:
        ext = tldextract.extract(domain)
        if ext.domain == "":
            return ext.suffix
        # Use the named attributes rather than slicing ext[1:]: slicing only
        # works while the result is a 3-field tuple, and would pick up any
        # extra fields a different tldextract version adds.
        return ".".join((ext.domain, ext.suffix))
    except Exception as e:
        print("extract_domain error: %s" % (e,))
        return "unknown"


def parse_metadata(path):
    """Load a "^"-separated metadata file and group its rows by domain.

    Only the columns listed in ``medata_field_num`` are kept; they are named
    after ``medata_field_info``. A ``mdomain`` column is derived from
    ``DNSQueryName`` via ``extract_domain`` and used as the grouping key.

    Returns the pandas GroupBy object keyed by ``mdomain``.
    """
    raw = pd.read_csv(path, sep="^", header=None)

    # Work on a copy so adding the derived column does not touch `raw`.
    selected = raw.iloc[:, medata_field_num].copy()
    selected.columns = medata_field_info

    query_names = selected["DNSQueryName"]
    selected["mdomain"] = query_names.apply(extract_domain)

    by_domain = selected.groupby('mdomain')
    return by_domain


def get_data_dist(df, col="sourceIP"):
    """Print how many rows of *df* share each value of column *col*.

    Prints, in order: the type of the per-value count object, the full
    per-value counts, a separator line, and the 10 largest counts.
    Returns None.
    """
    # Count rows per distinct value once and reuse it for every print.
    size = df.groupby(col).size()
    # Single-argument print() produces the same output on Python 2 and 3.
    print(type(size))
    print(size)
    print("-----------top 10-------------")
    print(size.nlargest(10))


def get_ipv4_dist(df, col="DNSReplyLength"):
    """Print the distribution of ``DNSReplyIPv4`` over rows where *col* > 0.

    Rows whose *col* value is not greater than 0 are dropped first and the
    row counts before and after that filter are printed. The remaining rows
    are counted per ``DNSReplyIPv4`` value; the full counts and then the 10
    largest counts are printed. Returns None.
    """
    kept = df[df[col] > 0]
    # One pre-formatted string keeps the output identical on Python 2 and 3.
    print("filter before length: %d filter after length: %d" % (len(df), len(kept)))
    # Count the kept rows per returned IPv4 address.
    size = kept.groupby(by="DNSReplyIPv4").size()
    print(size)
    print("-----------top 10-------------")
    print(size.nlargest(10))



def move_to(srcpath, domain, dst_path):
    """Copy the lines of *srcpath* that belong to *domain* into *dst_path*.

    Despite the name, the source file is left untouched: each "^"-separated
    line whose query-name field (column 55, 1-based) maps to *domain* via
    ``extract_domain`` is written unchanged to *dst_path*, which is created
    or overwritten.

    The comparison ignores letter case, because ``main`` passes a lower-cased
    domain while the query names in the file keep their original case.
    Lines with fewer than 55 fields are skipped rather than raising.
    """
    query_name_idx = 55 - 1  # 0-based position of the DNSQueryName column
    wanted = domain.lower()
    # Open the source first so a missing source does not leave behind an
    # empty, freshly truncated destination file.
    with open(srcpath) as r:
        with open(dst_path, "w") as w:
            for line in r:
                fields = line.split("^")
                if len(fields) <= query_name_idx:
                    # Truncated or malformed record: no query name to test.
                    continue
                if extract_domain(fields[query_name_idx]).lower() == wanted:
                    w.write(line)


# Root directory holding both the sampled input files and the labelled output.
_SAMPLE_ROOT = "/home/bonelee/latest_metadata_sample"

# File remembering the last label chosen for each domain across runs.
_HISTORY_FILE = "history_op.json"

# Menu key -> column whose value distribution get_data_dist() prints.
_DIST_COLUMNS = {
    "R1": "DNSRequestLength",
    "R2": "DNSReplyLength",
    "R3": "DNSRequestErrLength",
    "R4": "DNSReplyErrLength",
    "P1": "sourcePort",
    "P2": "destPort",
    "T": "DNSReplyTTL",
    "C2": "DNSReplyCode",
    "C1": "DNSRequestRRType",
    "RR": "DNSRRClass",
    "V": "DNSReplyIPv4",
    "S": "sourceIP",
    "D": "destIP",
    "N": "DNSQueryName",
}

# Label key -> sub-directory of _SAMPLE_ROOT that receives the labelled flows.
_LABEL_DIRS = {
    "B": "labeled_black",
    "W": "labeled_white",
    "L": "labeled_white_like",
    "CDN": "labeled_cdn",
    "DDOS": "labeled_ddos",
    "M": "labeled_mddos",
    "U": "labeled_unknown",
}


def _auto_label(domain):
    """Return the label implied by the (lower-cased) domain name, or None."""
    if domain.endswith(("win", "site", "vip")):
        return "U"
    if any(tag in domain for tag in ("lan", "local", "dhcp", "workgroup", "home")):
        return "DDOS"
    if "cdn" in domain:
        return "CDN"
    return None


def _print_menu():
    """Print the inspection keys and the label/quit keys."""
    print("-------------choose one--------------")
    print("sub domain: DNSQueryName(N)")
    print("ip: srcip(S) or dstip(D)")
    print("length: DNSRequestLength(R1) or DNSReplyLength(R2)")
    print("length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)")
    print("port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)")
    print("code: DNSReplyCode(C2) or DNSRequestRRType(C1)")
    print("other: DNSRRClass(RR) or DNSReplyIPv4(V)")
    print("-------------label or quit------------")
    print("black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)")
    print("next(Q) or exit(E)?")


def _label_domain(path, day, hour, domain, group, history_op, read_input):
    """Run the prompt loop for one domain group until it is labelled or skipped.

    Records the chosen label in *history_op* and copies the domain's flows
    with move_to(). Calls sys.exit() when the user enters "E".
    """
    domain = domain.lower()
    has_judged = False  # the stored-history prompt is offered only once
    need_break = False  # True when the label was not typed in by hand
    while True:
        _print_menu()
        auto = _auto_label(domain)
        if auto is not None:
            check = auto
            need_break = True
        elif domain in history_op and not has_judged:
            print("found history op: %s" % (history_op[domain],))
            # An empty answer (just Enter) accepts the stored label.
            if not read_input("OK(Enter for Y)?"):
                check = history_op[domain]
                need_break = True
            else:
                check = read_input("Input:")
        else:
            check = read_input("Input:")
        has_judged = True

        if check == "Q":
            print("%s next OK!" % path)
            return
        if check == "E":
            print("%s Exit!" % path)
            # The history file is written by the finally clause in main().
            sys.exit()
        if check in _LABEL_DIRS:
            # NOTE(review): input files are named 2017-09-* but labelled
            # output is named 2017-8-*; kept as found, confirm the intent.
            dst_path = "%s/%s/2017-8-%d-%d-%s.txt" % (
                _SAMPLE_ROOT, _LABEL_DIRS[check], day, hour, domain)
            move_to(path, domain, dst_path)
            history_op[domain] = check
            print("Saved OK!")
            if need_break:
                return
        elif check in _DIST_COLUMNS:
            get_data_dist(group, _DIST_COLUMNS[check])
        else:
            print("unknown input!Choose the following one:")


def main():
    """Interactively label every domain found in the hourly sample files.

    For days 24-26 and hours 0-23 the matching sample file is parsed and,
    for each domain in it, summary statistics are printed before the user
    is asked for a label (or the label is derived from the domain name).

    Decisions are remembered in history_op.json. The file is now written
    whenever this function ends, not only when the user types "E", so a run
    that reaches the last file no longer loses its decisions.
    """
    try:
        read_input = raw_input  # Python 2
    except NameError:
        read_input = input  # Python 3

    history_op = {}
    if os.path.exists(_HISTORY_FILE):
        with open(_HISTORY_FILE) as h:
            history_op = json.load(h)
            print(history_op)

    try:
        for day in range(24, 27):
            for hour in range(0, 24):
                path = "%s/sampled/unknown_sample/debugdogcom-medata_wanted-2017-09-%d-%d.txt" % (
                    _SAMPLE_ROOT, day, hour)
                if not os.path.exists(path) or os.path.getsize(path) == 0:
                    print("%s passed, file not exists or empty file." % path)
                    continue
                print("%s running..." % path)
                try:
                    domains_info = parse_metadata(path)
                except IOError as e:
                    print(e)
                    continue
                for domain, group in domains_info:
                    print("***************************************")
                    print("domain: %s flow count: %d" % (domain, len(group)))
                    print("***************************************")
                    print("------------srcip-----------------")
                    print(group["sourceIP"].describe())
                    print("--------------destip---------------")
                    print(group["destIP"].describe())
                    print("----------------------------------------")
                    print("ipv4 address return dist:")
                    get_ipv4_dist(group)
                    print("----------------------------------------")
                    _label_domain(path, day, hour, domain, group,
                                  history_op, read_input)
                print("*******************************")
                print("%s check over..." % path)
                print("*******************************")
    finally:
        # Runs on normal completion, on "E" (SystemExit) and on errors, so
        # labels that were already applied are always remembered.
        with open(_HISTORY_FILE, "w") as f:
            json.dump(history_op, f)
            print("saved history_op.json")


# Start the interactive labelling session only when this file is executed as
# a script; importing it as a module does not call main().
if __name__ == "__main__":
    main()

 

    原文作者:机器学习
    原文地址: https://www.cnblogs.com/bonelee/p/7608165.html
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞