
    ;i                     p    d Z ddlZddlZddlmZmZmZmZ ddl	Z	 e	j                  e      Z G d d      Zy)z7Pattern sampling strategies for diverse recommendation.    N)ListDictAnyOptionalc                      e Zd ZdZe	 	 ddeeeef      dee	   de
dedeeeef      f
d       Zedeeeef      dee	   de
deeeef      fd       Zedeeeef      dee	   de
deeeef      fd	       Zedeeeef      dee	   de
deeeef      fd
       Zedeeeef      dee	   de
deeeef      fd       Zy)DiversitySamplerzGSampler for diverse pattern selection to ensure balanced training data.patternsscorestop_kstrategyreturnc                    t        |       t        |      k7  r#t        dt        |        dt        |             t        |       |k  r| S |dk(  rt        j                  | ||      S |dk(  rt        j	                  | ||      S |dk(  rt        j                  | ||      S |dk(  rt        j                  | ||      S t        j                  d| d       t        j	                  | ||      S )	ay  Sample diverse patterns to ensure balanced training data.
        
        Args:
            patterns: List of pattern dictionaries
            scores: List of scores for each pattern
            top_k: Number of patterns to select
            strategy: Sampling strategy
                - "top": Select top-k highest scores (original)
                - "mixed": Mix of top, middle, and random
                - "stratified": Stratified sampling by score ranges
                - "uncertainty": Select patterns with high uncertainty (high variance)
        
        Returns:
            List of selected pattern dictionaries
        z%Patterns and scores length mismatch: z vs topmixed
stratifieduncertaintyzUnknown strategy z, using 'mixed')	len
ValueErrorr   _sample_top_sample_mixed_sample_stratified_sample_uncertaintyloggerwarning)r	   r
   r   r   s       I   /home/ubuntu/codebase/yexijia/保研/colocation_mvp/experiment/sampler.pysample_diverse_patternsz(DiversitySampler.sample_diverse_patterns   s    , x=CK'DS]OSWX[\bXcWdeffx=E!Ou#//&%HH #11(FEJJ%#66xOO&#77&%PPNN.xjHI#11(FEJJ    c                 p    t        t        | |      d d      }|d| D cg c]  \  }}|	 c}}S c c}}w )z/Original strategy: select top-k highest scores.c                     | d   S N    xs    r   <lambda>z.DiversitySampler._sample_top.<locals>.<lambda>9       QqTr   TkeyreverseN)sortedzip)r	   r
   r   rankedp_s         r   r   zDiversitySampler._sample_top6   s;     Hf->4P$Ven-ndan---s   2c           	         t        |       }t        t        | |      d d      }t        dt	        |dz              }t        dt	        |dz              }||z
  |z
  }g }t               }	t        |      D ]*  }
|j                  ||
   d          |	j                  |
       , |dz  }d	|z  dz  }t        t        |t        ||                  }|D 
cg c]	  }
|
|	vs|
 }}
|rXt        j                  |t        |t        |                  }|D ]*  }|j                  ||   d          |	j                  |       , t        |      D 
cg c]	  }
|
|	vs|
 }}
|rGt        j                  |t        |t        |                  }|D ]  }|j                  ||   d           t        j                  |       t        j                  d
| dt        |      |z
  |z
   d| d       |S c c}
w c c}
w )zMixed strategy: top + middle + random.
        
        Distribution:
        - Top 40%: Highest scores (likely positive)
        - Middle 30%: Medium scores (uncertain)
        - Random 30%: Random selection (exploration)
        c                     | d   S r    r"   r#   s    r   r%   z0DiversitySampler._sample_mixed.<locals>.<lambda>F   r&   r   Tr'   r!   g?g333333?r         zMixed sampling: z top + z
 middle + z random)r   r*   r+   maxintsetrangeappendaddlistminrandomsampleshuffler   info)r	   r
   r   nr,   n_topn_middlen_randomselectedselected_indicesimiddle_start
middle_endmiddle_candidatessampled_middleidx	remainingsampled_randoms                     r   r   zDiversitySampler._sample_mixed<   s    MHf->4PAs53;'(q#eck*+5=8+5 uAOOF1IaL)  # 
 AvUaZ
 |SQ5G!HI(9W(91QFV=VQ(9W#]]+<c(CPaLb>cdN%sA/ $$S) &
 !&aF1A5E,EQ	F#]]9c(C	N6STN%sA/ & 	x &ugWS]U5JX5U4VV`ai`jjqrs' X Gs   	G&G&	G+G+c           
      (   t        |       }t        t        | |      d d      }d}t        d||z        }|||z  z
  }g }||z  }	t	        |      D ]  }
|
|	z  }|
|dz
  k  r|
dz   |	z  n|}|
|dz
  k(  r|}t        t	        ||            }||
|dz
  k(  r|ndz   }|sKt        j                  |t        |t        |                  }|D ]  }|j                  ||   d            t        j                  |       t        j                  d| d       |S )	zIStratified sampling: divide score range into strata and sample from each.c                     | d   S r    r"   r#   s    r   r%   z5DiversitySampler._sample_stratified.<locals>.<lambda>q   r&   r   Tr'   r2   r!   r   zStratified sampling: z per stratum)r   r*   r+   r3   r6   r9   r;   r<   r:   r7   r=   r   r>   )r	   r
   r   r?   r,   n_stratasamples_per_stratum	remainderrC   stratum_sizestratumstartendstratum_candidates	n_samplessampledrJ   s                    r   r   z#DiversitySampler._sample_stratifiedm   s4    MHf->4P !!Uh%67/(::	H}XGl*E29HqL2H7Q;,.aC (Q,&!%eE3&7!8+GxRS|<SyYZ[I! --(:C	3OaKb<cd"COOF3KN3 # '  	x +,?+@MNr   c                     t        j                  |      }t        j                  |      }t        j                  ||z
        }t        j                  |      d| }|D cg c]  }| |   	 }}t
        j                  d| d|d       |S c c}w )zUncertainty-based sampling: select patterns near decision boundary.
        
        This selects patterns with scores close to the median (uncertain predictions).
        NzUncertainty sampling: selected z patterns near median score z.4f)nparraymedianabsargsortr   r>   )	r	   r
   r   scores_arraymedian_score	distancesuncertain_indicesrE   rC   s	            r   r   z$DiversitySampler._sample_uncertainty   s     xx'yy. FF<,67	 JJy1&59):;):AHQK):;5eW<XYefiXjkl <s   BN)   r   )__name__
__module____qualname____doc__staticmethodr   r   strr   floatr4   r   r   r   r   r   r"   r   r   r   r      s   Q 	%KtCH~&%KU%K %K 	%K
 
d38n	%K %KN .d4S>2 .DK .PS .X\]abegjbj]kXl . .
 .T#s(^ 4 .d5k .RU .Z^_cdgildl_mZn . .` T$sCx.%9 4; WZ _cdhilnqiqdr_s  B d4S>&: DK X[ `deijmorjres`t  r   r   )rg   numpyrZ   r;   typingr   r   r   r   logging	getLoggerrd   r   r   r"   r   r   <module>ro      s5    =   , , 			8	$U Ur   