salut! Dans cet article, j'essaierai de décrire ce qu'est le filtre Bloom, d'expliquer son objectif et de montrer les scénarios dans lesquels il peut être utilisé. J'implémente également le filtre Bloom en Python à partir de zéro pour faciliter la compréhension de ses composants internes.
Objectif du filtre Bloom
— , — , ( , O , — O(1)
). , , , . , — : 100% , , 100% , , ( ). , Python, , !
. , .
, . , ( , ). uplink 100 /. IP-, . , , 100 /, (O(log(n))
), , IP-, , IP- . , , IP- .
— . , , , : , ; (, ) , .
, , , , .
?
, . , 100 IP-. , IP- , — 100 , — IP. IP- , «1», — «0».
4- IP- , .
IP-?
, 100 IP. IPv4- 32 , , 4 294 967 296 (2^32) ( , , , )! IP- , , . , . IP- . - -.
-
- — , . , - IP-, , . - , , .
- , ( IP), . , .
… - . . , 100 IP-. - 100 2^32 IP- 100 - ? , . . - , IP- , , 4 294 967 296 (2^32) IP-, . , -, — , , . , - 192.168.1.1 192.168.1.2, , , , ( , ).
. IP- , , .
, : 100 IP-. IP- -, - , . , , IP- . — ?
, IP- 178.23.12.63 112.64.90.12 . IP , — . , IP- , , IP- . , ?
, , — , . 0, . , 1, - , . , , , , , IP .
, . — . (, , - , ), . , ( 1, ) (1-e^(m / n))
, m — , , n .
— -. , IP- -, .. 1. k
-, (1-e^(mk/n))^k
, , - (n/m)*ln(2)
( ).
-. IP- , - , IP 112.64.90.12 , 1.
, Python ! 50 .
BloomFilter
( ). ( ) , . bitarray
, . , - , -, .
import math
from bitarray import bitarray
class BloomFilter(object):
def __init__(self, size, number_expected_elements=100000):
self.size = size
self.number_expected_elements = number_expected_elements
self.bloom_filter = bitarray(self.size)
self.bloom_filter.setall(0)
self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))
def _hash_djb2(self, s):
hash = 5381
for x in s:
hash = ((hash << 5) + hash) + ord(x)
return hash % self.size
-, K -? . , -, -. -. - , -. , ?
def _hash(self, item, K):
return self._hash_djb2(str(K) + item)
. -, , , 1 ( True) .
def add_to_filter(self, item):
for i in range(self.number_hash_functions):
self.bloom_filter[self._hash(item, i)] = 1
, , . -. - 0, , . , 1, , .
def check_is_not_in_filter(self, item):
for i in range(self.number_hash_functions):
if self.bloom_filter[self._hash(item, i)] == 0:
return True
return False
! . !
, , . 1 , 100 000. «192.168.1.1» IP-.
bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))
, i 1 100 000 , IP 192.168.1.i ( IP-, i>254, 192.168.289, ). , , ; , , .
for i in range(1, 100000):
if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
print(base_ip+str(i))
192.168.1.1
! , 100 000 IP- , , — IP-. !
:
import math
from bitarray import bitarray
class BloomFilter(object):
def __init__(self, size, number_expected_elements=100000):
self.size = size
self.number_expected_elements = number_expected_elements
self.bloom_filter = bitarray(self.size)
self.bloom_filter.setall(0)
self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))
def _hash_djb2(self, s):
hash = 5381
for x in s:
hash = ((hash << 5) + hash) + ord(x)
return hash % self.size
def _hash(self, item, K):
return self._hash_djb2(str(K) + item)
def add_to_filter(self, item):
for i in range(self.number_hash_functions):
self.bloom_filter[self._hash(item, i)] = 1
def check_is_not_in_filter(self, item):
for i in range(self.number_hash_functions):
if self.bloom_filter[self._hash(item, i)] == 0:
return True
return False
bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))
for i in range(1, 100000):
if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
print(base_ip+str(i))
, . , , . !
- «ML Spark». ML Spark, production