直接上代码进行演示,对比三种方式耗时情况!

示例代码1:【循环读取数据,逐条插入 Elasticsearch】

import re
import time
from elasticsearch import Elasticsearch

# Matches the body of every <entry>...</entry> element. It is applied to text
# from which newlines were already removed, so '.' can span a whole entry.
ENTRY_PATTERN = re.compile(r'<entry>(.+?)</entry>')

# (document key, pattern) pairs in the order the fields are stored.
# Compiled once here instead of on every loop iteration. findall() returns a
# list, so every field value is a list of matches (tags such as <product> and
# <vuln-solution> may occur several times inside one entry).
FIELD_PATTERNS = (
    ('name', re.compile(r'<name>(.+?)</name>')),
    ('vuln-id', re.compile(r'<vuln-id>(.+?)</vuln-id>')),
    ('published', re.compile(r'<published>(.+?)</published>')),
    ('modified', re.compile(r'<modified>(.+?)</modified>')),
    ('source', re.compile(r'<source>(.+?)</source>')),
    ('severity', re.compile(r'<severity>(.+?)</severity>')),
    ('vuln-type', re.compile(r'<vuln-type>(.+?)</vuln-type>')),
    ('thrtype', re.compile(r'<thrtype>(.+?)</thrtype>')),
    ('vuln-descript', re.compile(r'<vuln-descript>(.+?)</vuln-descript>')),
    ('product', re.compile(r'<product>CPE:/(.+?):</product>')),
    ('vuln-solution', re.compile(r'<vuln-solution>(.+?)</vuln-solution>')),
)


def compact_lines(lines):
    """Strip newlines and spaces from each line and concatenate the results.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A ``(text, line_count)`` tuple: the concatenated text and the number
        of lines that were consumed.
    """
    # NOTE(review): removing every space also removes the spaces inside
    # free-text fields such as descriptions. Kept because the original does
    # the same - confirm this is intended.
    pieces = [line.replace('\n', '').replace(' ', '') for line in lines]
    # Joining once avoids growing one large string piece by piece.
    return ''.join(pieces), len(pieces)


def extract_fields(entry):
    """Return the document for one entry body as ``{key: [matches, ...]}``.

    Keys follow the order of FIELD_PATTERNS; a tag that is absent from the
    entry yields an empty list.
    """
    return {key: pattern.findall(entry) for key, pattern in FIELD_PATTERNS}


def main():
    """Read ./2007.xml line by line and index every entry with its own request.

    Prints the elapsed time of each stage: reading, splitting into entries,
    field extraction and insertion.
    """
    # Connect to the local Elasticsearch node.
    es = Elasticsearch("http://localhost:9200")

    # Stage 1: read the file line by line. The context manager closes the
    # handle even if reading fails.
    time_start = time.time()
    with open('./2007.xml', "r", encoding='UTF-8') as xml_file:
        all_str, total_num = compact_lines(xml_file)
    time_end = time.time()
    print("共处理了", total_num, "行xml数据")
    print("文件所有字符(字符串)长度为:", len(all_str))
    print("文件处理花费了:", time_end - time_start, "秒")

    # Stage 2: split the text into the bodies of its <entry> elements.
    list_entry = ENTRY_PATTERN.findall(all_str)
    print('共有' + str(len(list_entry)) + '条<entry>数据')
    time_end2 = time.time()
    print("取正则表达式取字符串<entry>  </entry>花费了:", time_end2 - time_end, "秒")

    # Stage 3: extract the fields of every entry.
    documents = [extract_fields(entry) for entry in list_entry]
    print(len(documents))
    time_end3 = time.time()
    print("循环处理所有的<entry>花费了:", time_end3 - time_end2, "秒")

    # Stage 4: one index request per document; the position in the list is
    # used as the document id.
    for doc_id, document in enumerate(documents):
        es.index(index='entry', id=doc_id, body=document)

    print("向数据库中插入花费了:", time.time() - time_end3, "秒")


if __name__ == "__main__":
    main()

运行结果:

示例代码2:【循环读取数据,批量插入 Elasticsearch】

import re
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers  # 批量处理数据

# Matches the body of every <entry>...</entry> element. It is applied to text
# from which newlines were already removed, so '.' can span a whole entry.
ENTRY_PATTERN = re.compile(r'<entry>(.+?)</entry>')

# (document key, pattern) pairs in the order the fields are stored.
# Compiled once here instead of on every loop iteration. findall() returns a
# list, so every field value is a list of matches (tags such as <product> and
# <vuln-solution> may occur several times inside one entry).
FIELD_PATTERNS = (
    ('name', re.compile(r'<name>(.+?)</name>')),
    ('vuln-id', re.compile(r'<vuln-id>(.+?)</vuln-id>')),
    ('published', re.compile(r'<published>(.+?)</published>')),
    ('modified', re.compile(r'<modified>(.+?)</modified>')),
    ('source', re.compile(r'<source>(.+?)</source>')),
    ('severity', re.compile(r'<severity>(.+?)</severity>')),
    ('vuln-type', re.compile(r'<vuln-type>(.+?)</vuln-type>')),
    ('thrtype', re.compile(r'<thrtype>(.+?)</thrtype>')),
    ('vuln-descript', re.compile(r'<vuln-descript>(.+?)</vuln-descript>')),
    ('product', re.compile(r'<product>CPE:/(.+?):</product>')),
    ('vuln-solution', re.compile(r'<vuln-solution>(.+?)</vuln-solution>')),
)


def compact_lines(lines):
    """Strip newlines and spaces from each line and concatenate the results.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A ``(text, line_count)`` tuple: the concatenated text and the number
        of lines that were consumed.
    """
    # NOTE(review): removing every space also removes the spaces inside
    # free-text fields such as descriptions. Kept because the original does
    # the same - confirm this is intended.
    pieces = [line.replace('\n', '').replace(' ', '') for line in lines]
    # Joining once avoids growing one large string piece by piece.
    return ''.join(pieces), len(pieces)


def extract_fields(entry):
    """Return the document for one entry body as ``{key: [matches, ...]}``.

    Keys follow the order of FIELD_PATTERNS; a tag that is absent from the
    entry yields an empty list.
    """
    return {key: pattern.findall(entry) for key, pattern in FIELD_PATTERNS}


def build_bulk_actions(documents, index_name):
    """Wrap every document in a bulk action targeting ``index_name``.

    The position of the document is stored as the ``id`` field inside
    ``_source``, ahead of the extracted fields.
    """
    # NOTE(review): no `_id` is set on the action, only an `id` field inside
    # the document; presumably Elasticsearch then generates its own ids and a
    # second run adds duplicates - confirm.
    # NOTE(review): `_type` is kept from the original; mapping types are
    # reported as removed in Elasticsearch 8 - confirm against the cluster.
    return [
        {
            "_index": index_name,
            "_type": "doc",
            "_source": {"id": position, **document},
        }
        for position, document in enumerate(documents)
    ]


def main():
    """Read ./2007.xml line by line and insert all entries with one bulk call.

    Prints the elapsed time of each stage: reading, splitting into entries,
    field extraction and insertion.
    """
    # Connect to the local Elasticsearch node.
    es = Elasticsearch("http://localhost:9200")

    # Stage 1: read the file line by line. The context manager closes the
    # handle even if reading fails.
    time_start = time.time()
    with open('./2007.xml', "r", encoding='UTF-8') as xml_file:
        all_str, total_num = compact_lines(xml_file)
    time_end = time.time()
    print("共处理了", total_num, "行xml数据")
    print("文件所有字符(字符串)长度为:", len(all_str))
    print("文件处理花费了:", time_end - time_start, "秒")

    # Stage 2: split the text into the bodies of its <entry> elements.
    list_entry = ENTRY_PATTERN.findall(all_str)
    print('共有' + str(len(list_entry)) + '条<entry>数据')
    time_end2 = time.time()
    print("取正则表达式取字符串<entry>  </entry>花费了:", time_end2 - time_end, "秒")

    # Stage 3: extract the fields of every entry.
    documents = [extract_fields(entry) for entry in list_entry]
    print(len(documents))
    time_end3 = time.time()
    print("循环处理所有的<entry>花费了:", time_end3 - time_end2, "秒")

    # Stage 4: send every document through the bulk helper.
    helpers.bulk(es, build_bulk_actions(documents, "entry_bulk"))

    print("向数据库中插入花费了:", time.time() - time_end3, "秒")


if __name__ == "__main__":
    main()

运行结果:

示例代码3:【一次性读取全部数据,批量插入 Elasticsearch】

import re
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers  # 批量处理数据

# Matches the body of every <entry>...</entry> element. It is applied to text
# from which newlines were already removed, so '.' can span a whole entry.
ENTRY_PATTERN = re.compile(r'<entry>(.+?)</entry>')

# (document key, pattern) pairs in the order the fields are stored.
# Compiled once here instead of on every loop iteration. findall() returns a
# list, so every field value is a list of matches (tags such as <product> and
# <vuln-solution> may occur several times inside one entry).
FIELD_PATTERNS = (
    ('name', re.compile(r'<name>(.+?)</name>')),
    ('vuln-id', re.compile(r'<vuln-id>(.+?)</vuln-id>')),
    ('published', re.compile(r'<published>(.+?)</published>')),
    ('modified', re.compile(r'<modified>(.+?)</modified>')),
    ('source', re.compile(r'<source>(.+?)</source>')),
    ('severity', re.compile(r'<severity>(.+?)</severity>')),
    ('vuln-type', re.compile(r'<vuln-type>(.+?)</vuln-type>')),
    ('thrtype', re.compile(r'<thrtype>(.+?)</thrtype>')),
    ('vuln-descript', re.compile(r'<vuln-descript>(.+?)</vuln-descript>')),
    ('product', re.compile(r'<product>CPE:/(.+?):</product>')),
    ('vuln-solution', re.compile(r'<vuln-solution>(.+?)</vuln-solution>')),
)


def compact_text(text):
    """Return ``text`` with every newline and every space removed."""
    # NOTE(review): removing every space also removes the spaces inside
    # free-text fields such as descriptions. Kept because the original does
    # the same - confirm this is intended.
    return text.replace('\n', '').replace(' ', '')


def count_lines(text):
    """Return the number of lines in ``text``.

    A final line without a trailing newline still counts; an empty string
    has zero lines.
    """
    if not text:
        return 0
    return text.count('\n') + (0 if text.endswith('\n') else 1)


def extract_fields(entry):
    """Return the document for one entry body as ``{key: [matches, ...]}``.

    Keys follow the order of FIELD_PATTERNS; a tag that is absent from the
    entry yields an empty list.
    """
    return {key: pattern.findall(entry) for key, pattern in FIELD_PATTERNS}


def build_bulk_actions(documents, index_name):
    """Wrap every document in a bulk action targeting ``index_name``.

    The position of the document is stored as the ``id`` field inside
    ``_source``, ahead of the extracted fields.
    """
    # NOTE(review): no `_id` is set on the action, only an `id` field inside
    # the document; presumably Elasticsearch then generates its own ids and a
    # second run adds duplicates - confirm.
    # NOTE(review): `_type` is kept from the original; mapping types are
    # reported as removed in Elasticsearch 8 - confirm against the cluster.
    return [
        {
            "_index": index_name,
            "_type": "doc",
            "_source": {"id": position, **document},
        }
        for position, document in enumerate(documents)
    ]


def main():
    """Read ./2007.xml in one call and insert all entries with one bulk call.

    Prints the elapsed time of each stage: reading, splitting into entries,
    field extraction and insertion.
    """
    # Connect to the local Elasticsearch node.
    es = Elasticsearch("http://localhost:9200")

    # Stage 1: read the whole file at once. The context manager closes the
    # handle even if reading fails.
    time_start = time.time()
    with open('./2007.xml', "r", encoding='UTF-8') as xml_file:
        raw_text = xml_file.read()
    all_str = compact_text(raw_text)
    time_end = time.time()
    # Report the real line count instead of a hard-coded 1. Counted after the
    # timer stops so it does not add to the measured read time.
    total_num = count_lines(raw_text)
    print("共处理了", total_num, "行xml数据")
    print("文件所有字符(字符串)长度为:", len(all_str))
    print("文件处理花费了:", time_end - time_start, "秒")

    # Stage 2: split the text into the bodies of its <entry> elements.
    list_entry = ENTRY_PATTERN.findall(all_str)
    print('共有' + str(len(list_entry)) + '条<entry>数据')
    time_end2 = time.time()
    print("取正则表达式取字符串<entry>  </entry>花费了:", time_end2 - time_end, "秒")

    # Stage 3: extract the fields of every entry.
    documents = [extract_fields(entry) for entry in list_entry]
    print(len(documents))
    time_end3 = time.time()
    print("循环处理所有的<entry>花费了:", time_end3 - time_end2, "秒")

    # Stage 4: send every document through the bulk helper.
    helpers.bulk(es, build_bulk_actions(documents, "fask_entry_bulk"))

    print("向数据库中插入花费了:", time.time() - time_end3, "秒")


if __name__ == "__main__":
    main()

运行结果:

插入数据后在数据库中查看:

Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐