3 回答

TA贡献1804条经验 获得超2个赞
更新的最终答案:
我想出了以下符合我需要的代码。笔记:
该功能在许多方面都不是最好的,但它的工作非常好!
该函数有 3 种数据生成模式:生成组合总数、生成每个值在所有组合中出现次数最少的组合、生成每个值在所有组合中出现“最大”次数的组合 (" max”的意思是“尽可能接近最大值”)。
该功能允许在选定范围内动态更改组合的长度或提供特定数字。
根据参数,该函数可以执行冗余次数的迭代,如“错误总数”所示。但...
它很快!它使用集合和元组来获得出色的性能。当 itertools.combinations 触发返回吨(数百万)个组合时,可能会发生唯一的问题,但就我而言,到目前为止从未发生过。
编码:
import numpy as np
import random
import itertools
from decimal import Decimal
def get_random_combinations(values, exclude, length, mode, limit, min_ = 0, max_ = 0):
done = set()
try:
"""Creating counter"""
counter = np.zeros(len(values), np.uint)
"""Create a mask for excluded values"""
"""https://stackoverflow.com/questions/25330959/how-to-select-inverse-of-indexes-of-a-numpy-array"""
mask = np.ones(len(np.array(values)), np.bool)
mask[exclude] = False
"""available values to create combinations"""
values_a = set(values) - set(exclude)
values_a = list(values_a)
if length == 1:
if mode == 'total':
"""generate just data_number of examples"""
for ii in range(limit):
comb = random.sample(values_a, 1)[0]
del values_a[values_a.index(comb)]
done.add(tuple([comb]))
else:
"""generate one example for each comb"""
for comb in values_a: done.add(tuple([comb]))
else:
"""total number of combinations"""
if isinstance(length, str): rr = np.mean([min_, max_])
else: rr = length
nn = len(values_a)
comb_max = int(Decimal(np.math.factorial(nn)) / Decimal(np.math.factorial(rr) * np.math.factorial(nn-rr)))
err_limit = int(comb_max * 0.01)
if err_limit > 10000: err_limit = 10000
"""initiate variables"""
#should itertools be used to generate the rest of combinations
gen_comb = False
#has all combinations generated by itertools ended
comb_left_0 = False
#has the limit of errors been reached to generate itertools combinations
err_limit_reached = False
#previous combination
ll_prev = 0
dd = 0 #done counter
comb_left = set() #itertools combinations
err = 0 #errors counter
"""options variables for statistics"""
err_t = 0 #total number of errors
gen = 0 #total number of generations of itertools.combinations
ii = 0 #total number of iterations
print('GENERATING LIST OF COMBINATIONS')
while True:
"""print progress"""
ii = ii + 1
if not dd % 1000: print('.', end='')
"""check if length of combs is random or not"""
if isinstance(length, str):
"""change max_ length of combinations to
\the length of available values"""
if len(values_a) < max_: max_ = len(values_a)
ll = random.randint(min_, max_)
else: ll = length
if ll != ll_prev: gen_comb = True
"""generate combinations only when err limit is reached or
the length of combinations has changed"""
if err_limit_reached and gen_comb:
gen = gen + 1
"""after reaching the max number of consecutive errors, start generating combinations via itertools"""
"""generation is done at this point to prevent generation for a very long list"""
"""generation is also done when when length of a combination changes"""
comb_left = set(itertools.combinations(values_a, ll)) - done
"""break if there are no elements left"""
if not len(comb_left): break
"""if combinations has already been generated, used them"""
if comb_left:
"""take random sample from the set"""
comb = random.sample(comb_left, 1)[0]
"""remove it from the set"""
comb_left.remove(comb)
"""check if it was the last combination to break the loop at the end"""
if not len(comb_left): comb_left_0 = True
else:
"""generate random combination"""
comb = tuple(sorted(random.sample(values_a, ll)))
"""set previous length"""
ll_prev = ll
"""reset gen_comb"""
gen_comb = False
"""check if combination is new"""
if comb not in done: found = True
else:
"""otherwise, iterate errors"""
err = err + 1
err_t = err_t + 1
found = False
if err > err_limit: err_limit_reached = gen_comb = True
if found:
"""reset err"""
err = 0
dd = dd + 1
"""add combination to done"""
done.add(comb)
"""increase counter for the combs"""
counter[list(comb)] = counter[list(comb)] + 1
"""check if seekeing the max number of combinations or min"""
if mode == 'max':
"""for max, we must remove the elements which reached the limit"""
for l_ in list(comb):
if counter[l_] == limit:
del values_a[values_a.index(l_)]
"""if length of available elements is smaller than the possible length of the combinations"""
if isinstance(length, str):
"""for random length, choose the minimal length"""
if len(values_a) < min_: break
else:
if len(values_a) < ll: break
"""if all elements reached the limit"""
if mode == 'total':
if len(done) >= limit: break
else: #min, max
if all(counter[mask] >= limit): break
"""if the number of consecutive errors reached
the total number of combinations, break as you may never
draw a valid combination"""
if err > comb_max: break
"""if it was the last combination left"""
if comb_left_0: break
except Exception as e: print(e)
print('')
print('Combinations generated: ' + str(dd))
print('Total number of iterations: ' + str(ii))
print('Final value of err: ' + str(err))
print('Total number of errors: ' + str(err_t))
print('How many times itertools.combinations was used: ' + str(gen))
return done
"""range of values to create combinations"""
values = range(256)
"""values excluded from the combinations"""
exclude = [0,255]
"""length of combinations, if is string, the number of layers
is generated randomly withing (min_, max_) range """
length = 'r'
"""mode of how the combinations are generated:
min: minimal number of times the value appears across all combinations
(limited down by the limit value)
max: max number of times the value appears across all combinations (limited
max by the limit value)
total: total number of combinations (limited the limit value)"""
mode = 'max'
"""limit used for the mode combinations' generation"""
limit = 1000
"""min_ and max_ are used when length is string,
length is generated randomly within (min_, max_) range"""
min_ = 4
max_ = 12
done = get_random_combinations(values, exclude, length, mode, limit, min_, max_)

TA贡献1995条经验 获得超2个赞
这是另一个关注随机方面的答案。
您已经n
选择k
了要随机采样的可能性,以获得z
每个值的近似出现次数(使用我在另一个答案中的符号)。我假设如果您采用(n * z) // k
k
-size 的样本,并且您的随机数生成器实际上是统一的,您将自动获得z
每个元素的近似出现次数。在您的示例中,使用n=256
和k=3
,z=100
在 8533 中,256 个 bin 之间的分布确实是相当均匀的,这似乎是合理的。
如果您愿意接受某种程度的一致性缺陷,pythonrandom.sample
是一个不错的选择。人口是从零开始n
选择的所有整数k
。
n
k
在这种情况下选择是256 * 255 * 254 / 6 = 2763520
。这超出了有符号 32 位整数的范围,但很适合无符号整数。更好的是,您可以简单地使用 Python 的无限精度整数。
诀窍是将这些数字映射到唯一的值组合。这是使用组合数系统完成的,如此处所述。
from random import sample
from scipy.misc import combs
def gen_samples(n, k, z):
codes = sample(range(combs(n, k)), (n * z) // k)
return [unrank(n, k, code) for code in codes]
def unrank(n, k, i):
"""
Implementation of Lehmer's greedy algorithm left as
exercise for the reader
"""
# return k-element sequence
有关取消排名的提示,请参见此处。

TA贡献1806条经验 获得超5个赞
总共有多种n
选择k
可能的组合符合您的标准,其中n = length
和k = com_len
。n choose k
评估为n! / (k! * (n - k)!)
。如果您生成所有不同的可能性,则每个n
值都会出现(n - 1)! / ((k - 1)! * (n - k)!)
多次(https://math.stackexchange.com/q/26619/295281)。你应该能够解决这个问题,假设z <= (n - 1)! / ((k - 1)! * (n - k)!)
, where z = counter_expected
。
对于您的示例:
n = 256
k = 3
z = 100 <= 32385
通常,生成组合的一种常用方法是k
通过长度为的布尔数组n
逐步增加位,始终递增可能的最低位。每当更高的位增加时,它下面的所有位都会重置为其初始位置。这是一个示例序列:
0 0 0 0 3 2 1
0 0 0 3 0 2 1
0 0 0 3 2 0 1
0 0 0 3 2 1 0
0 0 3 0 0 2 1
...
3 2 0 0 1 0 0
3 2 0 1 0 0 0
3 2 1 0 0 0 0
我已经对位置进行了标记,以便您可以看到,如果值开始排序,则组合将始终排序。n
请记住,您可以将其实现为布尔值或k
索引数组。两者都有优点和缺点。
对于您的特定用例,有一个转折点。一旦计数超过一定数量,您就不要使用一点。有许多方法可以逐步遍历这些位,但它们都归结为拥有一个大小n
计数器数组。
如果n * z
是 的倍数k
,您将能够自动获得所有垃圾箱中的准确计数。实际上,它们本身都n
不必z
是 的倍数k
。但是,如果这不是真的,您将不可避免地出现下溢或上溢。直观地说,您希望一次生成一个n * z
总值目标k
。很明显,一个必须是后者的倍数才能使这成为可能。
您可以有两种类型的退出标准。给定所有位的总累积计数s
,
s >= n * z
: 所有位的计数至少为z
. 最多k - 1
位的计数为z + 1
.s > n * z - k
:所有位的计数为z
,最多k - 1
位除外,因此再添加一个组合将导致条件 1。
最后要讨论的一种设计选择是位移动的顺序。由于生成一系列组合会耗尽一个垃圾箱,我希望能够以可预测的顺序在数组存储桶的一侧保持耗尽的垃圾箱按顺序累积。这将从算法中删除很多检查。因此,我不会增加可能的最低位,而是增加可能的最高位,并在它重置时增加它下面的一位。在这种情况下,用尽的桶将始终是最低位。
因此,让我们最终停止制作未经证实的听起来很数学的陈述,并展示一个实现:
def generate_combos(n, k, z):
full_locs = np.arange(k + 1, dtype=np.uint)
full_locs[k] = n # makes partial vectorization easier
locs = full_locs[:k] # bit indices
counts = np.zeros(n, dtype=np.uint) # counter buckets
values = np.arange(n, dtype=np.uint) # values
min_index = 0 # index of lowest non-exhausted bin
for _ in range((n * z) // k):
counts[locs] += 1
yield values[locs]
if counts[min_index] == z:
# if lowest bin filled, shift and reset
min_index += np.argmax(counts[min_index:] < z)
locs[:] = min_index + np.arange(k)
else:
# otherwise, increment highest available counter
i = np.flatnonzero(np.diff(full_locs) > 1)
if i.size:
i = i[-1]
locs[i] += 1
# reset the remainder
locs[i + 1:] = locs[i] + np.arange(1, k - i)
else:
break
这使用条件 2。如果您需要条件 1,请在后面添加以下行:
if counters[-1] < z:
yield values[-k:]
将循环更改为类似for _ in range(-((n * z) // -k)): (由https://stackoverflow.com/a/54585138/2988730提供)将无济于事,因为计数器并非旨在处理它。
这是一个IDEOne 链接,显示了 的前一百个元素generate_combos(256, 3, 10):
[0 1 2]
[0 1 3]
[0 1 4]
[0 1 5]
[0 1 6]
[0 1 7]
[0 1 8]
[0 1 9]
[ 0 1 10]
[ 0 1 11]
[2 3 4]
[2 3 5]
[2 3 6]
[2 3 7]
[2 3 8]
[2 3 9]
[ 2 3 10]
[ 2 3 11]
[ 2 3 12]
[4 5 6]
[4 5 7]
[4 5 8]
[4 5 9]
[ 4 5 10]
[ 4 5 11]
[ 4 5 12]
[ 4 5 13]
[6 7 8]
[6 7 9]
[ 6 7 10]
[ 6 7 11]
[ 6 7 12]
[ 6 7 13]
[ 6 7 14]
[ 8 9 10]
[ 8 9 11]
[ 8 9 12]
[ 8 9 13]
[ 8 9 14]
[ 8 9 15]
[10 11 12]
[10 11 13]
[10 11 14]
[10 11 15]
[10 11 16]
[12 13 14]
[12 13 15]
[12 13 16]
[12 13 17]
[12 13 18]
[13 14 15]
[14 15 16]
[14 15 17]
[14 15 18]
[14 15 19]
[14 15 20]
[15 16 17]
[16 17 18]
[16 17 19]
[16 17 20]
[16 17 21]
[16 17 22]
[16 17 23]
[17 18 19]
[18 19 20]
[18 19 21]
[18 19 22]
[18 19 23]
[18 19 24]
[18 19 25]
[19 20 21]
[20 21 22]
[20 21 23]
[20 21 24]
[20 21 25]
[20 21 26]
[20 21 27]
[21 22 23]
[22 23 24]
[22 23 25]
[22 23 26]
[22 23 27]
[22 23 28]
[22 23 29]
[24 25 26]
[24 25 27]
[24 25 28]
[24 25 29]
[24 25 30]
[24 25 31]
[24 25 32]
[26 27 28]
[26 27 29]
[26 27 30]
[26 27 31]
[26 27 32]
[26 27 33]
[26 27 34]
[28 29 30]
[28 29 31]
...
请注意,在前 10 个元素之后,两者0都1出现了 10 次。2并且3出现过一次,所以它们只在 9 次迭代后就用完了,以此类推。
添加回答
举报