
    ci_                         d dl Z d dlmZ d dlmZ ddZe j                  d        Ze j                  ddej                  fd       Z
e j                  ddej                  fd       Zy)	    N)TensorDescriptorc                    t        |      }t        | j                        }t        |      }|dk  r||z  }d|cxk  r|dz
  k  sJ d        J d       |dk  sJ d       t        |      |k(  sJ d       d}d}||   |k  sJ d	       |||<   | j                  |      }d
|z
  |gt	        |      D cg c]  }| j                  |       c}z   }	||g|z   }
ddg|z   }t        | |
|	|      S c c}w )a  
    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
    which behaves like a concatenation (along the first axis) of subarrays
    of potentially unequal size.

    The load_ragged and store_ragged device functions can be used to read
    and write from subarrays T[batch_offset : batch_offset + batch_size]
    with hardware bounds-checking preventing any sort of leakage outside
    the subarray.
    r      zlast dimension cannot be ragged   z<read-write ragged descriptors must have at most 3 dimensionsz1block shape must have same length as tensor shapei     @z#number of rows may not exceed 2**30l        )listshapelenstrideranger   )Tblock_shape
ragged_dimtensor_shaperankmax_intbillionragged_stridei
tma_stride	tma_shape	box_shapes               Q/var/www/html/engine/venv/lib/python3.12/site-packages/triton/tools/ragged_tma.pycreate_ragged_descriptorr      s)    {#K=L|DA~d

%TAX%H'HH%H'HH%19TTT9{t#X%XX#GG
#w.U0UU.&LHHZ(M -'7PUVZP[:\1188A;:\\J7#l2IQ+%IAy*i@@	 ;]s   0C)c                 *    d}||z
  |z   }| |z   }|||fS )z;
    Helper function for load_ragged and store_ragged.
    r    )batch_offset
batch_sizerowr   xys         r   to_ragged_indicesr"   0   s.     G*s"Az!AAq=    r   c                 0   t        j                  t        | j                        t        |      dz   k(  d       t	        ||||         \  }}}| j                  ||g|d| z   |gz   ||dz   d z         }t        j                  ||j                  dd       }|S )z
    Read from a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where reading outside the subarray gives zeros.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.load().
       z*TMA must be a read-write ragged descriptorNr   )tlstatic_assertr
   r	   r"   loadreshape)	TMAr   r   coordsr   c0c1c2datas	            r   load_raggedr0   =   s     S^s6{Q68de"<VJ=OPJBB88RHvkz22bT9F:PQ>?<SSTD::dDJJqrN+DKr#   c                     t        ||||         \  }}}t        j                  |ddg|j                  z         }| j	                  ||g|d| z   |gz   ||dz   d z   |       y)a  
    Write to a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where writes outside the subarray are masked
    correctly.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.store().
    r   N)r"   r&   r)   r	   store)	r*   r   r   r+   r/   r   r,   r-   r.   s	            r   store_raggedr3   O   so     #<VJ=OPJBB::dQFTZZ/0DIIr2h,,t3fZ!^_6MMtTr#   )r   )tritontriton.languagelanguager&   triton.tools.tensor_descriptorr   r   jitr"   	constexprr0   r3   r   r#   r   <module>r:      sv      ;
%AP 	 	 2<<  " U",, U Ur#   