
    qi                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZ d dlZd dl Zd dl!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 erd dl7m8Z8 d dl9m:Z:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZA dZB e4eCd      ZD G d deE      ZF G d d      ZG G d d      ZHee&j                  e&j                  f   ZKej                   G d d             ZMej                   G d d             ZN G d  d!eN      ZO G d" d#      ZP G d$ d%      ZQ G d& d'eN      ZR G d( d)ePeR      ZS G d* d+eQeR      ZT G d, d-eN      ZU G d. d/ePeU      ZV G d0 d1eQeU      ZW G d2 d3ePeN      ZX G d4 d5eQeN      ZY G d6 d7ePeN      ZZej                  d:d8       Z\	 	 	 	 d;d9Z]y)<    )annotationsN)CallableIterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)do_bench_using_profilingget_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet)
ModuleType)PartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      e Zd Zy)!NonzeroWorkspaceNotSupportedErrorN__name__
__module____qualname__     f/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/autotune_process.pyr*   r*   <       r0   r*   c                      e Zd ZdZedd       Ze	 d	 	 	 	 	 	 	 dd       Zedd       ZddZd Z	ddZ
ddd	Zddd
ZdddZddZddZddZddZy)TuningProcesszF
    Class to launch and interact with a benchmarking subprocess.
    c                     t         j                  dt        j                         t        j                  j                  t                      fd}	  |        y# t        $ r Y yw xY w)z4
        Entry point for the child process.
        z3Started autotune subprocess %s. Visible devices: %sc                     	 t         j                        \  } }| y 	 |rt        j                  j	                  |        |        }t         j                  |       [# t
        $ r}|}Y d }~'d }~ww xY wN)r4   recvosenvironupdate	Exceptionsend)job	extra_envresulte	read_pipe
write_pipes       r1   workloopz,TuningProcess.process_main.<locals>.workloopP   sq    !.!3!3I!>Y; 

)))4 UF ""6:6  ! Fs   (A 	A3'A..A3N)autotuning_logdebugr9   getpidr:   getr'   EOFError)rB   rC   rD   s   `` r1   process_mainzTuningProcess.process_mainE   sQ    
 	AIIKJJNN/0	
	7	J 		s   A 	A('A(Nc                T    t        j                  | |f|       |j                          y r7   )pickledumpflush)objrC   r?   s      r1   r=   zTuningProcess.sendd   s#     	S)$j1r0   c                ,    t        j                  |       S r7   )rL   load)rB   s    r1   r8   zTuningProcess.recvk   s    {{9%%r0   c                2    || _         | j                          y r7   )devicestart)selfrS   s     r1   __init__zTuningProcess.__init__o   s    

r0   c                   t         j                  j                  t         j                  j                  t              d      }t        j
                         \  }}t        j
                         \  }}t        j                  |d      | _        t        j                  |d      | _        t        j                         | _        | j                  j                  | j                  t        j                         t        j                  |dt        j                           dt#        |       dt#        |       g}i t%               dt'               t(        j*                  rdndd	}| j,                  t#        | j,                        |t.        <   t1        j2                  ||||f      | _        t        j6                  |       t        j6                  |       d| _        y
)z4
        Start the benchmarking subprocess.
        z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)r9   pathjoindirname__file__pipefdopenrC   rB   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerG   strr   r   r$   /profile_bandwidth_with_do_bench_using_profilingrS   r'   
subprocessPopenprocesscloserunning)rU   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdr_   s           r1   rT   zTuningProcess.starts   sw    RWW__X68NO$&GGI!$&GGI!!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01

#%
  #24 EE DG
 ;;"(+DKK(8C$%!''%'78

 	!
!"r0   c                V    | j                   xr | j                  j                         du S )z:
        True if the subprocess is still running.
        N)rt   rr   pollrU   s    r1   alivezTuningProcess.alive   s%     ||; 1 1 3t ;;r0   c                    | j                         s| j                          t        j                  || j                  |       y)z8
        Push a work item to the child process.
        r?   N)r~   rT   r4   r=   rC   )rU   reqr?   s      r1   putzTuningProcess.put   s/     zz|JJL39Er0   c                   	 | j                   j                  |      s"t        d| j                  j                         t
        j                  | j                        \  }}t        |t              r||S # t        $ r | j                           t        $ r | j                           t        $ r< t        j                  d| j                  j                         | j                           w xY w)z
        Get a response from the child process. Raises TimeoutError on timeout;
        raises EOFError if the subprocess crashes.
        zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)ri   selectTimeoutErrorrr   pidr4   r8   rB   killrI   rs   r<   rE   	exception
isinstance)rU   timeoutr@   _s       r1   rH   zTuningProcess.get   s    
	==''0"%DT\\EUEUDV#WXX%**4>>:IFA fi(L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   AA5 5A7C,c                    | j                         r t        j                  d| j                         |r| j	                          yy)zC
        Signal the child process to shut down gracefully.
        N)r~   r4   r=   rC   wait)rU   r   s     r1   shutdownzTuningProcess.shutdown   s2     ::<tT__5IIK r0   c                x    | j                         r| j                  j                          | j                          y)z5
        Wait for the child process to exit.
        N)r~   rr   r   rs   r}   s    r1   r   zTuningProcess.wait   s&     ::<LL

r0   c                    | j                   j                          | j                  j                          | j                  j                          d| _        y)z"
        Close resources.
        FN)ri   rs   rB   rC   rt   r}   s    r1   rs   zTuningProcess.close   s;     	r0   c                    | j                         rDt        j                  d| j                  j                         | j                  j                          | j                          y)z6
        Send a SIGKILL to the child process.
        z)Sending SIGKILL to autotune subprocess %dN)r~   rE   errorrr   r   r   rs   r}   s    r1   r   zTuningProcess.kill   sF     ::<  ;   LL

r0   c                H    | j                  d       | j                          y)z8
        Gracefully restarts the child process.
        Tr   N)r   rT   r}   s    r1   restartzTuningProcess.restart   s     	4 

r0   )rB   	IO[bytes]rC   r   returnNoner7   )rO   r   rC   r   r?   dict[str, str] | Noner   r   )rB   r   r   r   )rS   Optional[int])r   bool)r   r   r?   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r,   r-   r.   __doc__staticmethodrJ   r=   r8   rV   rT   r~   r   rH   r   r   rs   r   r   r/   r0   r1   r4   r4   @   s      < LP'4I	  & &+Z<F6
r0   r4   c                  J    e Zd ZdZddZed	d       ZddZd
dZ	 	 	 	 ddZ	y)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    c                V   | j                         }t        j                  d|       |D cg c]  }t        |       c}| _        t        j                         | _        | j                  D ]  }| j                  j                  |        t        t        |            | _        yc c}w )z,
        Start the child processes.
        z$Sub-process autotune device list: %s)rS   )max_workersN)get_device_listrE   rF   r4   	processesqueueQueueprocess_queuer   r   lenexecutor)rU   devicesrS   ps       r1   rV   zTuningProcessPool.__init__   s     &&(CWM FMM6-v6M9> 	&A""1%	& +s7|D Ns   B&c                 l   t         j                  sdgS t               } t        |       }|j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r$   autotune_multi_devicer   r   device_countr'   r9   r:   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r1   r   z!TuningProcessPool.get_device_list  s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS!s1vSGSw<5(((NE%L!!	 Ts   7B1c                    | j                   j                          | j                  D ]  }|j                  d        | j                  D ]  }|j                           y)z5
        Signal all child processes to exit.
        Fr   N)r   r   r   r   )rU   r   s     r1   r   zTuningProcessPool.shutdown&  sQ     	  	#AJJEJ"	# 	AFFH	r0   c                F   |j                   J ddg}|D ci c])  }|t        j                  v s|t        j                  |   + }}| j                  j	                         }|j                  |j                   j                  |       	 |j	                  t        j                        | j                  j                  |       S c c}w # t        $ rB t        j                  d| d       t        d      cY | j                  j                  |       S t        $ rf}t        j                  d| d       d	t        |      v r|j                          t        d      cY d}~| j                  j                  |       S d}~ww xY w# | j                  j                  |       w xY w)
z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        NTORCHINDUCTOR_CACHE_DIRTRITON_CACHE_DIRr   zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice 'cudaErrorLaunchFailure)bmreqr9   r:   r   rH   r   	benchmarkr$   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   r<   rn   r   )rU   choiceenv_varsvr?   rr   process_exceptions          r1   targetzTuningProcessPool.target1  sx    ||'''-/AB/7K!1

?Q

1%K	K$$((*FLL**i@	,;;BB. ""7+7 L  	 MM1& :W W
 < ""7+  	 MM.vh 7W W (3/@+AA!<""7+	  ""7+sG   CC	C -F 5F F A E;F F ;F  F F c           	     x    t        t        || j                  j                  | j                  |                  }|S )z>
        Benchmark each choice in a separate process.
        )dictzipr   mapr   )rU   choicesresultss      r1   r   zTuningProcessPool.benchmarkW  s/     s7DMM$5$5dkk7$KLMr0   Nr   )r   zSequence[Optional[int]])r   r"   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])
r,   r-   r.   r   rV   r   r   r   r   r   r/   r0   r1   r   r      sC    E& " "(	$,L+ 
+r0   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicerS   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec           
     :   t        |t              r4|D cg c]  }| j                  |       }}t        d |D              sJ |S |}t        |t        j
                        rt	        j                  d|      }|j                         }|J |j                         }|J t        ||t        j                  j                  j                  |j                         t        j                         t        j                  j                  j                  |j#                         t        j                         t        j                  j                  j%                  |j'                         j(                  t        j                         |j+                               S c c}w )Nc              3  <   K   | ]  }t        |t                y wr7   )r   r   .0xs     r1   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>x  s     AQz!Z0A   fake)r   layout)fallback)rS   r   r   r   r   r   )r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r&   graphsizevars
size_hintsget_sizer$   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r@   noder   rS   s          r1   r   zTensorMeta.from_irnodesr  sU    gx(>E F!1!1!!4 FF FA&AAAAMdBII&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    t        | j                  | j                  | j                  | j                  | j
                        S )N)rS   r   
extra_size)r   r   r   rS   r   r   r}   s    r1   	to_tensorzTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r0   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)r,   r-   r.   __annotations__r   classmethodr   r   r/   r0   r1   r   r   i  sQ    ((++KD-!
E!
	,!
 !
F
r0   r   c                  x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZddZdd	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)BenchmarkRequesta1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                   || _         t        |t              r	|g| _        n|| _        rLt        t        t
        f      r6t              dkD  rt        fdD              sJ d   | _        || _	        y | _        || _	        y )Nr#   c              3  d   K   | ]'  }d D ]   }t        d   |      t        ||      k(   " ) yw))rS   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r1   r   z,BenchmarkRequest.__init__.<locals>.<genexpr>  sG       Q  .q148GAt<LLLs   -0r   )
kernel_namer   r   input_tensor_metatupler   r   r   r   
extra_args)rU   r  r  r   r  s      ` r1   rV   zBenchmarkRequest.__init__  s     ''48I7JD"7HD"*-?%"O%&* /   
 '9&;D# % 3ED#$r0   c                   t         r7   NotImplementedErrorrU   outinput_tensorss      r1   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r0   c                     y r7   r/   r}   s    r1   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r0   Nr	  c                   t         r7   r  rU   fnr	  r
  s       r1   do_benchzBenchmarkRequest.do_bench  s
     "!r0   c               >   t         j                  t        j                        }|rt	        j                         }|e| j
                  r| j                  sJ d       t        |      dk(  sJ t        d | j
                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         }	  | j                  |d|i}|r+t	        j                         z
  }t	        j                         } | j                  |g|| }|r9t	        j                         z
  }	t         j                  dt!        |       |	       | j#                          |S # t        $ r# t         j                  d       t        d      cY S w xY w)NzJInput and output tensor meta must be populated when input_tensors is emptyr   c              3  <   K   | ]  }|j                           y wr7   )r   r   s     r1   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !PA!++-!Pr   r	  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rE   isEnabledForloggingDEBUGtimer  r   r   r  r   r  r*   infor   r  rF   rn   r  )
rU   r	  r
  rF   start_tscreate_tensor_elapser  load_elapseresbench_elapses
             r1   r   zBenchmarkRequest.benchmark  ss   
 ++GMM:yy{H ;))d.E.E \E }%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!!=:c:B ))+0Kyy{HdmmB44499;1L  HD	$ 	
+ 1 	  RS<	 s   E0 0)FF)
r  rn   r  r   r   r   r  Iterable[Any]r   r   r
  r   r	  r   r   zCallable[[], None]r   r
  r   r	  Optional[torch.Tensor]r   r   )	r,   r-   r.   r   rV   r  r  r  r   r/   r0   r1   r   r     s    %% ?% @	%
 "% 
%:"*"1="	"
 '+	" %" $	"
 
" '+,$, $, 
	,r0   r   c                  N    e Zd ZdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)	_TestBenchmarkRequestz
    Supports unit testing. Defined in this file instead of the test file so the
    TuningProcess sub-process can unpickle these objects.
    Nc                J    || _         || _        || _        || _        || _        y r7   )r@   rS   sleepexccrash)rU   r@   rS   r&  r'  r(  s         r1   rV   z_TestBenchmarkRequest.__init__  s'     

r0   r  c               r   | j                   <t        j                  j                  t        d       t        | j                         k(  sJ | j                  rt        j                  | j                         | j                  r| j                  | j                  rt        j                  d       | j                  S )Nr#   )rS   r9   r:   rH   r'   rn   r&  r  r'  r(  rl   exitr@   r  s      r1   r   z_TestBenchmarkRequest.benchmark  sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r0   )        NNNF)
r@   r   rS   r   r&  zOptional[float]r'  zOptional[Exception]r(  r   r!  )r,   r-   r.   r   rV   r   r/   r0   r1   r$  r$    sq      $!%#'  	
 !  KO*1G	r0   r$  c                  $    e Zd Zdd	 	 	 	 	 ddZy)GPUDeviceBenchmarkMixinNr  c                  t        d g ||D              }t        |      dk  s
J d|        t        d |D        d      }t        |      }t        |      dk(  rt        t	        |            }n|j                         }|j                  |      5  t        j                  |      }|j                          d d d        |S # 1 sw Y   S xY w)Nc              3     K   | ]i  }t        |t        j                        rMt        |j                  j
                        r.|j                  j                  |j                  j                   k y wr7   )r   torchTensorr   rS   typeindexr   tensors     r1   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>/  sR      $
&%,,/v}}))*##/	 MM$
s   A/A1r#   zCan not mix devices c              3     K   | ]9  }t        |j                  j                        r|j                  j                   ; y wr7   )r   rS   r2  r4  s     r1   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>8  s4      &--,,- ""s   ?Acuda)
r   r   nextr   itercurrent_devicerS   r%   benchmark_gpusynchronize)	rU   r  r	  r
  device_idx_setdevice_typer   
device_idxr  s	            r1   r  z GPUDeviceBenchmarkMixin.do_bench)  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0 	+++B/C((*	+ 
		+ 
s   &CCr!  r,   r-   r.   r  r/   r0   r1   r-  r-  (  s*    
 '+	 % $	
 
r0   r-  c                  $    e Zd Zdd	 	 	 	 	 ddZy)CPUDeviceBenchmarkMixinNr  c               ,    t        j                  |      S r7   )r%   benchmark_cpur  s       r1   r  z CPUDeviceBenchmarkMixin.do_benchL  s     ((,,r0   r!  r@  r/   r0   r1   rB  rB  K  s*    
 '+	- %- $	-
 
-r0   rB  c                       e Zd Z	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZd ZddZ xZS )	TritonBenchmarkRequestc                    t         |   ||||       || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        y r7   )superrV   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpack)rU   r  r  r   r  rI  rJ  rK  rL  rM  rN  rO  rP  rQ  	__class__s                 r1   rV   zTritonBenchmarkRequest.__init__X  s_      	&79KZX& 0$"#6 %:"$8!(
r0   c               ~   t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  }t        | j                        }d|j                  _        i }dd l}d|j                  |      j                  v rd|d<   |j                   j"                  dk(  rd}nP|j                   j"                  }	t%        |	      }
|
j'                  | j(                  j                   j*                        }t-        t        || j                        t.        j0                  j2                  j4                  j6                        r!t9        j:                  |g|||i |d|iS t9        j:                  |g|||i ||ddS )	Nz"benchmark module key: %s, path: %sFr   warmupcpustreamT)rV  benchmark_run)r   load_by_key_pathrJ  rI  rE   rF   r   r  runr   r  __self__with_bandwidth_infoinspect	signature
parametersrS   r2  r   get_raw_streamr   r3  r   r0  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rU   r	  r
  mod
run_methodr  
warmup_argr\  rV  r>  r   s              r1   r  z"TritonBenchmarkRequest.make_run_fns  s    **4+@+@$BRBRS0!!	
 S$"2"2377
$//*
27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 $$  	
    $$  	
  " r0   c                    t        j                  | j                  | j                        }t	        || j
                        j                          y r7   )r   rX  rJ  rI  r   r  
precompile)rU   rf  s     r1   rj  z!TritonBenchmarkRequest.precompile  s9    **4+@+@$BRBRST%%&113r0   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r  rI  rJ  r}   s    r1   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr0   )r   r   r   r   r   )r  rn   r  r   r   r   r  r  rI  rn   rJ  rn   rK  r   rL  r   rM  r   rN  r   rO  r   rP  r   rQ  r   r   r   r   r   rn   )r,   r-   r.   rV   r  rj  rm  __classcell__rR  s   @r1   rF  rF  U  s     $%%&$% ? @	
 "     !  # "   
64*41=4	4l4Ur0   rF  c                      e Zd Zy)TritonGPUBenchmarkRequestNr+   r/   r0   r1   rr  rr    r2   r0   rr  c                      e Zd Zy)TritonCPUBenchmarkRequestNr+   r/   r0   r1   rt  rt    r2   r0   rt  c                       e Zd ZdZ	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZdd	 	 	 d fdZddZd Zdd	Z	 xZ
S )ExternKernelBenchmarkRequesta*  
    A class to handle extern kernel benchmark requests. This allows extern kernels
    (like aten::mm) to be benchmarked in a subprocess, similar to Triton kernels.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    Nc                \    t         |   ||||       || _        |xs i | _        || _        y r7   )rH  rV   callable_pathkwargshas_out_variant)	rU   r  r  r   r  rx  ry  rz  rR  s	           r1   rV   z%ExternKernelBenchmarkRequest.__init__  s5     	&79KZX*l.r0   c                   | j                         }| j                  rt        j                  |g|d|iS t        j                  |g| S )Nr	  )to_callablerz  rd  re  )rU   r	  r
  r  s       r1   r  z(ExternKernelBenchmarkRequest.make_run_fn  sN     $$RA-ASAA $$R8-88r0   r  c                  ||j                         dk(  ry| j                  st              dk(  rt        |   d|iS | j                           }|tt        j                  j                  j                  j                  |t        |j                               t        |j                                      |j                  |       t        j                   rt#        fd      S t%        j                  i       S )Nr   r+  r	  c                        S r7   r/   )algor
  s   r1   <lambda>z8ExternKernelBenchmarkRequest.benchmark.<locals>.<lambda>  s    m8L r0   )numelrz  r   rH  r   r|  r0  _C_dynamoguardsassert_size_strider  sizestridecopy_r$   ro   r   r%   )rU   r	  r
  out_newr  rR  s     ` @r1   r   z&ExternKernelBenchmarkRequest.benchmark  s     ?syy{a/3}#5#:7$m===##%DM*G  ''::U388:.cjjl0C 		'"EE/0LMM((}bAAr0   c                     y r7   r/   r}   s    r1   rj  z'ExternKernelBenchmarkRequest.precompile  s    r0   c                    ddl m} t        || j                        }| j                  r t        j                  |fi | j                  S |S )Nr   )extern_kernels) torch._inductor.select_algorithmr  r   r  ry  rd  re  )rU   r  r  s      r1   r|  z(ExternKernelBenchmarkRequest.to_callable  s@     	D^T%5%56;;$$R74;;77	r0   c                "    d| j                    dS )NzExternKernelBenchmarkRequest())rx  r}   s    r1   rm  z$ExternKernelBenchmarkRequest.__str__   s    .t/A/A.B!DDr0   )NT)r  rn   r  r   r   r   r  r  rx  rn   ry  zOptional[dict[str, Any]]rz  r   r   r   r   )r
  r   r	  r"  r   rn  )r,   r-   r.   r   rV   r  r   rj  r|  rm  ro  rp  s   @r1   rv  rv    s     ,0 $// ?/ @	/
 "/ / )/ / 
/	9*	91=	9		9 KOB*B1GB(
Er0   rv  c                      e Zd Zy)ExternKernelGPUBenchmarkRequestNr+   r/   r0   r1   r  r         	r0   r  c                      e Zd Zy)ExternKernelCPUBenchmarkRequestNr+   r/   r0   r1   r  r  
  r  r0   r  c                  t     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd Z	 	 	 	 	 	 d
dZddZd ZddZ	ddZ
 xZS )CUDABenchmarkRequestae  
    A class to handle CUDA (CUTLASS) benchmark requests. This class is for
    managing the lifecycle of a CUDA kernel benchmark, including compiling
    the source code, managing workspace memory, and executing the kernel.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    c                    t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        d| _        t        j                  | j                  d      \  | _        | _        y )Nr   F so)rH  rV   source_codeworkspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerU   r  r  r   r  r  rR  s         r1   rV   zCUDABenchmarkRequest.__init__  sr     	&79KZX&#$15)-',$ "*7*=*=d>N>NPT*U't'r0   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y)z
        Precompile the CUDA source code to populate the CUDACodeCache.
        This may happen in a separate thread pool.
        Precompiling %sr  Done precompiling %sN)rE   rF   r   compiler  r}   s    r1   rj  zCUDABenchmarkRequest.precompile,  s<    
 	.5d..53T:r0   c          	        | j                          | j                          t        |      |gz   D cg c]  }t        |j	                                }}t
        j                  d| j                  | j                  | j                  | j                  || j                         t        t        j                  j                         j                        }t!        | j                  | j                        }t        d      }| j"                  dkD  rht        j$                  | j"                  dz   dz  t        j&                  |j(                        | _        t        | j*                  j	                               }t-        j.                  |g|| j                  d|| }	  |        |S c c}w # t0        $ r,}	t3        |	      fd}
| j5                          |
cY d}	~	S d}	~	ww xY w)zc
        Create a function to run the CUDA kernel with the given input and output tensors.
        zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   rS   Nc                     t               r7   )RuntimeError)err_msgs   r1   raise_runtime_errorz=CUDABenchmarkRequest.make_run_fn.<locals>.raise_runtime_errorc  s    "7++r0   )ensure_dll_loadedupdate_workspace_sizer   r
   data_ptrrE   rF   r  r  r  r  r  r0  r7  current_streamcuda_streamr   r  zerosfloat64rS   r  rd  re  r  rn   r  )rU   r	  r
  r5  args
stream_ptrrg  workspace_ptrretrA   r  r  s              @r1   r  z CUDABenchmarkRequest.make_run_fn5  s    	 ""$:>}:MQTPU:UV*+VVMMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
W WD  	'!fG, !&&	's#    F'F, ,	G!5!GG!G!c           
        | j                   ry | j                          t        t        j	                  d | j
                  D                    }t        |dz         D cg c]  }t        d        }}t        t        j                  j                         j                        }t        | j                  | j                        }t               } |g || j                   t#        |      d |  t        j                  j%                          |j&                  | _        t*        j-                  d| j(                  | j                  | j.                  | j0                  | j                  || j                          d| _         y c c}w )Nc              3  4   K   | ]  }|j                     y wr7   )r   )r   metas     r1   r   z=CUDABenchmarkRequest.update_workspace_size.<locals>.<genexpr>p  s     G$))Gs   r#   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r   fromkeysr  r   r
   r0  r7  r  r  r   r  r  r	   r  r   r<  valuer  rE   rF   r  r  )rU   unique_input_countr   r  r  rg  c_workspace_sizes          r1   r  z*CUDABenchmarkRequest.update_workspace_sizek  sU   ''  MMG0F0FGG
 )..@1.D(EF1FFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$7 Gs   E>c                    | j                   4t        j                  | j                  d      \  | _         | _        | _        y y )Nr  )r  r   rQ   r  r  r  r}   s    r1   r  z&CUDABenchmarkRequest.ensure_dll_loaded  s:    888E8J8J  $95DHdmT%5 r0   c                l    | j                   !| j                   j                          d | _         d | _        y r7   )r  rs   r  r}   s    r1   r  z#CUDABenchmarkRequest.cleanup_run_fn  s(    88HHNNDHr0   c                T    d| j                   d| j                  d| j                  S )Nrl  z, self.source_file=z, self.hash_key=)r  r  r  r}   s    r1   rm  zCUDABenchmarkRequest.__str__  s0    #$""$$8t'7'7&99JDMM;KLLr0   r  rn   r  r   r   r   r  r  r  rn   r   r   r   r   rn  )r,   r-   r.   r   rV   rj  r  r  r  r  rm  ro  rp  s   @r1   r  r    s    VV ?V @	V
 "V V 
V$;4*41=4	4l",HMr0   r  c                  Z     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZ xZS )CppBenchmarkRequestc                f    t         |   ||||       || _        t        |      | _        d | _        y r7   )rH  rV   r  r   r  r  r  s         r1   rV   zCppBenchmarkRequest.__init__  s5     	&79KZX& -6:r0   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )Nr  rU  r>  r  )rE   rF   r   rQ   r  r}   s    r1   rj  zCppBenchmarkRequest.precompile  s<     	.5$**>3T:r0   c               \   t        j                  | j                  d      | _        t	        |      |gz   D cg c]  }|j                          }}t        j                  d| j                  | j                  || j                         t        | j                  | j                        }t        d | j                  D              sJ t        j                  gt        |      t        t	        | j                              z   z  |_        t!        j"                  |g|| j                   S c c}w )NrU  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  P   K   | ]  }t        |t        j                           y wr7   )r   ctypesc_ulonglong)r   args     r1   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>  s     R3:c6#5#56Rs   $&)r   rQ   r  r  r   r  rE   rF   r  r  r   r   r  r  r   argtypesrd  re  )rU   r	  r
  r5  r  rg  s         r1   r  zCppBenchmarkRequest.make_run_fn  s     $$T%5%55I04]0Cse0KLf!LLXHHOO	
 TXXt'7'78
R$//RRRR%112ID122


   

 __
 	
! Ms   D)c                     d| j                   S )Nrl  )r  r}   s    r1   rm  zCppBenchmarkRequest.__str__  s    #$""$%%r0   r  r   rn  )r,   r-   r.   rV   rj  r  rm  ro  rp  s   @r1   r  r    sj    ;; ?; @	;
 "; ; 
;;
*
1=
	
6&r0   r  c                  P     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZ xZS )CuteDSLBenchmarkRequestz;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.c                    t         |   ||||       |j                         }t        j                  |      \  | _        | _        y r7   )rH  rV   finalize_allr   r  rJ  rI  )rU   r  r  r   r  r  finalized_coderR  s          r1   rV   z CuteDSLBenchmarkRequest.__init__  sC     	&79KZX$1132=2C2CN2S/t/r0   c          	     T  	 t        j                  | j                  | j                        }ddlm} | j                   d| }t        ||      s?t        |      D cg c]  }t        t        ||            s| }}t        d| d|       t        ||      		fd}|S c c}w )z
        Create a function to run the CuteDSL kernel with the given input and output tensors.
        Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
        r#   )MAIN_SUFFIXr   z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 ~    t        d      } | j                  j                  j                        } g d|iS )Nr7  rV  )r   r_  rS   r3  )r   rV  r
  kernel_funcr	  s     r1   
run_kernelz7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernel  s@    7?%44SZZ5E5EFFBBsB6BBr0   )r   rX  rJ  rI  codegen.cutedsl.cutedsl_kernelr  r  hasattrdircallabler   r  )
rU   r	  r
  rf  r  main_func_namer   	availabler  r  s
    ``      @r1   r  z#CuteDSLBenchmarkRequest.make_run_fn  s     **4+@+@$BRBRS 	@ ,,-Q{m<sN+*-c(S$hwsD?Q6RSIS??OOghqgrs  c>2	C
  Ts   B%9B%)r  rn   r  r   r   r   r  ztuple[Any, ...]r  r!   r   r   r   )r,   r-   r.   r   rV   r  ro  rp  s   @r1   r  r    sj    ETT ?T @	T
 $T #T 
T*1=	r0   r  c                 X    t               } t        j                  | j                         | S r7   )r   atexitrj   r   )pools    r1   get_tuning_process_poolr    s    D
OODMM"Kr0   c                4    t               j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )r  r   )r   s    r1   benchmark_in_sub_processr    s     #$..w77r0   )r   r   r   )^
__future__r   r  r  dataclassesrd  r  r9   rL   r   rg   rp   rl   r  r   collections.abcr   r   r   concurrent.futuresr   r   r	   r
   r   typingr   r   r   r   r   r0  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   r   r   r   torch._loggingr   torch.utils._ordered_setr   typesr    r  r!   r"   r  r$   runtime.benchmarkingr%   virtualizedr&   r'   r,   rE   r<   r*   r4   r   r   r   LayoutOrBuffer	dataclassr   r   r$  r-  rB  rF  rr  rt  rv  r  r  r  r  r  cacher  r  r/   r0   r1   <module>r     s   "      	     
   8 8 1 2 2 : :  $ C .    - /  T  -  . "8\:		 	t tnl l^ ryy"))+, 3
 3
 3
l b b bJ, D   F- -YU- YUx	 79O 		 79O 	HE#3 HEV	9		9	LM24D LM^4&13C 4&n+57G +\  8'8&8r0   