写代码实现,对长度为N (N非常大,不能一次放入内存)的乱序整数集合,获取它的Top K(可以放到内存)个元素。
思路
在内存中维护(K)个元素的小顶堆,遍历一遍N,每个元素判断是否大于堆顶元素,大于则替换堆顶,向下调整,保持小顶堆。重复上述步骤,最后堆中留下的就是top N。
代码
相关知识
小顶堆
小顶堆是一种经过排序的完全二叉树,其中任一根节点的数据值均不大于其左子节点和右子节点的值。则其根(堆顶)存储的就是整个堆中最小的值。

小顶堆建堆代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
| public class LitHeapKest {
static int []a = new int[]{9,8,7,6,5,4,3,2,1,0}; public static void main(String[] args) { for (int i = 1;i<a.length;i++){ addj(i,a[i]); } print(); }
private static void addj(int k,int n){
int len = a.length ;
while(k<len && (k-1) >= 0){ int parent = (k-1)/2; if( a[parent] <= n ){ break; }
a[k] = a[parent]; k = parent;
} a[k] = n ; }
private static void print(){ for (int i = 0;i<a.length;i++){ System.out.print(a[i]+" "); } } }
|
替换堆顶元素后向下调整为小顶堆
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
| public class LitHeapKest {
static int []a = new int[]{9,8,7,6,4,3,2,1,0}; public static void main(String[] args) { for (int i = 1;i<a.length;i++){ addj(i,a[i]); } print(); swap(5); System.out.println(); print(); }
private static void swap(int n) { if (a[0] < n) { a[0] = n; }
addjdown(); }
private static void addjdown(){ int pos = 0 ; int len = a.length; int sub = pos*2+1;
while (sub < len){ if(sub+1 <len && a[sub] > a[sub+1] ){ sub ++ ; }
if(a[pos] < a[sub]){ break; }
int tmp = a[pos]; a[pos] = a[sub]; a[sub] = tmp;
pos = sub; sub = pos*2+1; } }
private static void addj(int k,int n){
int len = a.length ;
while(k<len && (k-1) >= 0){ int parent = (k-1)/2; if( a[parent] <= n ){ break; }
a[k] = a[parent]; k = parent;
} a[k] = n ; }
private static void print(){ for (int i = 0;i<a.length;i++){ System.out.print(a[i]+" "); } } }
|
代码实现
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
| public class TopKFromN {
int k = 5; int m = 13; int n = 102;
int[] pool = new int[n];
public TopKFromN() { initPool(); }
private void initPool() {
for (int i = 0; i < n; i++) { pool[i] = RandomUtils.nextInt(0, n + 1); } }
private int[] getPerBatch(int batch) {
int startPos = (batch - 1) * m; int endPos = (batch) * m; if (endPos > n) { endPos = n; } int length = endPos - startPos; int b[] = new int[length];
System.arraycopy(pool, startPos, b, 0, length); return b; }
private int[] getTopK() {
int topK[] = new int[k]; boolean initHeap = false;
int batchSize = (n - 1) / m + 1;
for (int x = 1; x <= batchSize; x++) { int startPos = 0 ; int b[] = getPerBatch(x);
for (int pos = 0 ;pos<b.length;pos++){ if(!initHeap){ add(topK,b[pos],pos); if(pos == k-1){ initHeap = true ; } }else { if(topK[0] < b[pos]){ swap(topK,b[pos]); } } } } return topK; }
private void add(int []k,final int n,final int p){ if(p == 0){ k[0] = n; return ; }
int pos = p; while(pos>0){ int parent = (pos-1)>>>1; if(k[parent] <= n){ break; } k[pos] = k[parent]; pos = parent; } k[pos] = n; }
private void swap(int []k,int n){ int pos = 0 ; int sub = pos*2+1;
while (sub < k.length){
if((sub+1)<k.length && k[sub] > k[sub+1]){ sub ++ ; } if(n < k[sub]){ break; } k[pos] = k[sub]; pos = sub; sub = pos*2+1; } k[pos] = n; }
public static void main(String[] args) { TopKFromN topKFromN = new TopKFromN();
int[] _k = topKFromN.getTopK(); printArray(_k); }
private static void printArray(int[] p){ for (int e:p){ System.out.print(e +"\t"); } System.out.println(); }
|
参考
Java堆结构PriorityQueue完全解析
https://blog.csdn.net/u013309870/article/details/71189189
从简单选择排序到堆排序的深度解析
https://blog.csdn.net/touch_2011/article/details/6767673
海量数据中找出topK
https://blog.csdn.net/suibianshen2012/article/details/52003082
TopK算法
https://blog.csdn.net/dingpiao190/article/details/73604718