Qu'est-ce que Bloom Filter?





salut! Dans cet article, j'essaierai de décrire ce qu'est le filtre Bloom, d'expliquer son objectif et de montrer les scénarios dans lesquels il peut être utilisé. J'implémente également le filtre Bloom en Python à partir de zéro pour faciliter la compréhension de ses composants internes.





Objectif du filtre Bloom 

— , — , ( , O , — O(1)



). , , , . , — : 100% , , 100% , , ( ). , Python, , !





. , .





, . , ( , ). uplink 100 /. IP-, . , , 100 /, (O(log(n))



), , IP-, , IP- . , , IP- .





— . , , , : , ​​ ; (, ) , .





, , , , .





.





?

, . , 100 IP-. , IP- , — 100 , — IP. IP- , «1», — «0».





4- IP- , .





IP-?

, 100 IP. IPv4- 32 , , 4 294 967 296 (2^32) ( , , , )! IP- , , . , . IP- . - -.





-

- — , . , - IP-, , . - , , .





- , ( IP), . , .





… - . . , 100 IP-. - 100 2^32 IP- 100 - ? , . . - , IP- , , 4 294 967 296 (2^32) IP-, . , -, — , , . , - 192.168.1.1 192.168.1.2, , , , ( , ).





. IP- , , .





, : 100 IP-. IP- -, - , . , , IP- . — ?





, IP- 178.23.12.63 112.64.90.12 . IP , — . , IP- , , IP- . , ?





, , — , . 0, . , 1, - , . , , , , , IP .





, . — . (, , - , ), . , ( 1, ) (1-e^(m / n))



, m — , , n .





— -. , IP- -, .. 1. k



-, (1-e^(mk/n))^k



, , - (n/m)*ln(2)



( ).





-. IP- , - , IP 112.64.90.12 , 1.





, Python ! 50 .





BloomFilter



( ). ( ) , . bitarray



, . , - , -, .





import math
from bitarray import bitarray

class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))
      
      



- . ( ) DJB2. , .





def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size
      
      



-, K -? . , -, -. -. - , -. , ?





def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)
      
      



. -, , , 1 ( True) .





def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1
      
      



, , . -. - 0, , . , 1, , .





 def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False
      
      



! . !





, , . 1 , 100 000. «192.168.1.1» IP-.





bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))
      
      



, i 1 100 000 , IP 192.168.1.i ( IP-, i>254, 192.168.289, ). , , ; , , .





for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



192.168.1.1





! , 100 000 IP- , , — IP-. !





:





import math
from bitarray import bitarray


class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))


    def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size


    def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)


    def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1


    def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False


bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))

for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



, . , , . !






«Data Engineer».









- «ML Spark». ML Spark, production












All Articles