Hardware Acceleration & GPU Programming
Modern Mobile GPU Architecture
GPU vs CPU Performance Characteristics
Performance Comparison:
┌─────────────────┬─────────────┬─────────────┐
│ Aspect          │ CPU         │ GPU         │
├─────────────────┼─────────────┼─────────────┤
│ Cores           │ 4-12        │ 1000-3000   │
│ Clock Speed     │ 2-3 GHz     │ 400-800 MHz │
│ Memory          │ Large Cache │ High BW     │
│ Parallelism     │ Task        │ Data        │
│ Power Efficiency│ Variable    │ High        │
└─────────────────┴─────────────┴─────────────┘
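The practical takeaway from this comparison: work where the same independent operation is applied to many elements (pixels, vertices, matrix cells) is what benefits from the GPU's wide parallelism. A small illustrative Kotlin sketch of such a data-parallel workload (a hypothetical brightness adjustment; the function name is illustrative):
kotlin
// Each output pixel depends only on its own input pixel, so every iteration of the
// loop body is independent -- exactly the shape of work a GPU's many ALUs favor.
fun adjustBrightness(pixels: IntArray, factor: Float): IntArray =
    IntArray(pixels.size) { i ->
        val p = pixels[i]
        val a = (p ushr 24) and 0xFF
        val r = (((p ushr 16) and 0xFF) * factor).toInt().coerceIn(0, 255)
        val g = (((p ushr 8) and 0xFF) * factor).toInt().coerceIn(0, 255)
        val b = ((p and 0xFF) * factor).toInt().coerceIn(0, 255)
        (a shl 24) or (r shl 16) or (g shl 8) or b
    }
Because no iteration depends on another, this kind of kernel can be spread across thousands of GPU lanes; the sections below show how to actually move such work onto the GPU on each platform.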
Android Hardware Acceleration
RenderScript for Parallel Processing
Note: RenderScript has been deprecated since Android 12. It is shown here because it remains the clearest illustration of a data-parallel kernel on Android; new projects should prefer Vulkan compute or other platform alternatives.
c
// rs_blur_kernel.rs -- simple box-blur kernel (RenderScript kernels are written in C99)
#pragma version(1)
#pragma rs java_package_name(com.example.graphics)

rs_allocation input;
int width;
int height;
float radius;

uchar4 RS_KERNEL blur_kernel(uint32_t x, uint32_t y) {
    float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
    int count = 0;
    int radiusInt = (int)radius;
    for (int dy = -radiusInt; dy <= radiusInt; dy++) {
        for (int dx = -radiusInt; dx <= radiusInt; dx++) {
            int nx = x + dx;
            int ny = y + dy;
            if (nx >= 0 && nx < width && ny >= 0 && ny < height) {
                // Bitmap-backed allocations hold uchar4 (RGBA_8888) elements
                float4 pixel = rsUnpackColor8888(rsGetElementAt_uchar4(input, nx, ny));
                sum += pixel;
                count++;
            }
        }
    }
    sum /= (float)count;
    // The return value is written into the output allocation passed to forEach
    return rsPackColorTo8888(sum);
}
kotlin
// Kotlin RenderScript Integration
class GPUImageProcessor(private val context: Context) {

    private val renderScript: RenderScript = RenderScript.create(context)
    private val blurScript = ScriptC_rs_blur_kernel(renderScript)

    fun applyGaussianBlur(bitmap: Bitmap, radius: Float): Bitmap {
        val inputAllocation = Allocation.createFromBitmap(renderScript, bitmap)
        val outputAllocation = Allocation.createTyped(renderScript, inputAllocation.type)

        // Set script globals via the reflected setters
        blurScript.set_input(inputAllocation)
        blurScript.set_width(bitmap.width)
        blurScript.set_height(bitmap.height)
        blurScript.set_radius(radius)

        // Launch the kernel; its return value is written into outputAllocation
        blurScript.forEach_blur_kernel(outputAllocation)

        // Copy the result back into a Bitmap
        val outputBitmap = Bitmap.createBitmap(
            bitmap.width,
            bitmap.height,
            Bitmap.Config.ARGB_8888
        )
        outputAllocation.copyTo(outputBitmap)

        // Release GPU-side allocations
        inputAllocation.destroy()
        outputAllocation.destroy()

        return outputBitmap
    }

    fun cleanup() {
        blurScript.destroy()
        renderScript.destroy()
    }
}
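For a Gaussian blur specifically, the built-in ScriptIntrinsicBlur intrinsic is usually faster and simpler than a hand-written kernel; it is limited to radii up to 25. A minimal sketch:
kotlin
// Blur a bitmap with the framework intrinsic instead of a custom kernel.
fun applyIntrinsicBlur(context: Context, bitmap: Bitmap, radius: Float): Bitmap {
    val rs = RenderScript.create(context)
    val input = Allocation.createFromBitmap(rs, bitmap)
    val output = Allocation.createTyped(rs, input.type)
    val blur = ScriptIntrinsicBlur.create(rs, Element.U8_4(rs))
    blur.setRadius(radius.coerceIn(0.1f, 25f))  // the intrinsic accepts radii in (0, 25]
    blur.setInput(input)
    blur.forEach(output)
    val result = Bitmap.createBitmap(bitmap.width, bitmap.height, Bitmap.Config.ARGB_8888)
    output.copyTo(result)
    input.destroy(); output.destroy(); blur.destroy(); rs.destroy()
    return result
}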
Vulkan API Implementation
kotlin
class VulkanRenderer {
private var instance: Long = 0
private var device: Long = 0
private var commandPool: Long = 0
private var queue: Long = 0
external fun createVulkanInstance(): Long
external fun createDevice(instance: Long): Long
external fun createCommandPool(device: Long): Long
external fun submitCommands(commandPool: Long, commands: ByteArray)
companion object {
init {
System.loadLibrary("vulkan-renderer")
}
}
fun initialize(): Boolean {
return try {
instance = createVulkanInstance()
device = createDevice(instance)
commandPool = createCommandPool(device)
true
} catch (e: Exception) {
Log.e("Vulkan", "Initialization failed", e)
false
}
}
fun renderFrame(vertices: FloatArray, indices: IntArray) {
val commandBuffer = buildCommandBuffer(vertices, indices)
submitCommands(commandPool, commandBuffer)
}
private fun buildCommandBuffer(vertices: FloatArray, indices: IntArray): ByteArray {
// Build Vulkan command buffer
// This would typically be done in native code
return byteArrayOf()
}
}
cpp
// Native Vulkan Implementation (vulkan-renderer.cpp)
#include <jni.h>
#include <vector>
#include <vulkan/vulkan.h>
#include <android/log.h>
#define LOG_TAG "VulkanRenderer"
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
extern "C" {
JNIEXPORT jlong JNICALL
Java_com_example_VulkanRenderer_createVulkanInstance(JNIEnv *env, jobject thiz) {
VkApplicationInfo appInfo{};
appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
appInfo.pApplicationName = "Mobile Graphics App";
appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.pEngineName = "Custom Engine";
appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.apiVersion = VK_API_VERSION_1_0;
VkInstanceCreateInfo createInfo{};
createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
createInfo.pApplicationInfo = &appInfo;
VkInstance instance;
VkResult result = vkCreateInstance(&createInfo, nullptr, &instance);
if (result != VK_SUCCESS) {
LOGE("Failed to create Vulkan instance: %d", result);
return 0;
}
LOGI("Vulkan instance created successfully");
return reinterpret_cast<jlong>(instance);
}
JNIEXPORT jlong JNICALL
Java_com_example_VulkanRenderer_createDevice(JNIEnv *env, jobject thiz, jlong instancePtr) {
VkInstance instance = reinterpret_cast<VkInstance>(instancePtr);
// Enumerate physical devices
uint32_t deviceCount = 0;
vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);
if (deviceCount == 0) {
LOGE("No Vulkan-compatible devices found");
return 0;
}
std::vector<VkPhysicalDevice> devices(deviceCount);
vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());
// Select first suitable device (simplified)
VkPhysicalDevice physicalDevice = devices[0];
// Create logical device
float queuePriority = 1.0f;
VkDeviceQueueCreateInfo queueCreateInfo{};
queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
queueCreateInfo.queueFamilyIndex = 0; // Simplified
queueCreateInfo.queueCount = 1;
queueCreateInfo.pQueuePriorities = &queuePriority;
VkDeviceCreateInfo createInfo{};
createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
createInfo.queueCreateInfoCount = 1;
createInfo.pQueueCreateInfos = &queueCreateInfo;
VkDevice device;
VkResult result = vkCreateDevice(physicalDevice, &createInfo, nullptr, &device);
if (result != VK_SUCCESS) {
LOGE("Failed to create Vulkan device: %d", result);
return 0;
}
LOGI("Vulkan device created successfully");
return reinterpret_cast<jlong>(device);
}
}
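Before calling initialize(), it is worth checking whether the device advertises Vulkan support at all. A minimal Kotlin sketch using the platform feature flags (available from API 24):
kotlin
import android.content.Context
import android.content.pm.PackageManager

// Returns true when the device declares a Vulkan 1.0-capable GPU.
// 0x400003 is the encoded form of version 1.0.3, the minimum Android reports.
fun isVulkanSupported(context: Context): Boolean =
    context.packageManager.hasSystemFeature(
        PackageManager.FEATURE_VULKAN_HARDWARE_VERSION, 0x400003
    )
Falling back to OpenGL ES or a CPU path when this check fails keeps the renderer usable on older devices.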
iOS Metal Framework Integration
Metal Performance Shaders (MPS)
swift
import Metal
import MetalPerformanceShaders
class MetalImageProcessor {
private let device: MTLDevice
private let commandQueue: MTLCommandQueue
private let library: MTLLibrary
init() {
guard let device = MTLCreateSystemDefaultDevice(),
let commandQueue = device.makeCommandQueue(),
let library = device.makeDefaultLibrary() else {
fatalError("Metal not supported on this device")
}
self.device = device
self.commandQueue = commandQueue
self.library = library
}
func applyGaussianBlur(to texture: MTLTexture, radius: Float) -> MTLTexture? {
guard let commandBuffer = commandQueue.makeCommandBuffer() else {
return nil
}
// Create output texture
let descriptor = MTLTextureDescriptor.texture2DDescriptor(
pixelFormat: texture.pixelFormat,
width: texture.width,
height: texture.height,
mipmapped: false
)
descriptor.usage = [.shaderRead, .shaderWrite]
guard let outputTexture = device.makeTexture(descriptor: descriptor) else {
return nil
}
// Apply Gaussian blur using MPS
let gaussianBlur = MPSImageGaussianBlur(device: device, sigma: radius)
gaussianBlur.encode(
commandBuffer: commandBuffer,
sourceTexture: texture,
destinationTexture: outputTexture
)
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
return outputTexture
}
func performMatrixMultiplication(matrixA: [Float], matrixB: [Float],
rowsA: Int, colsA: Int, colsB: Int) -> [Float]? {
guard let commandBuffer = commandQueue.makeCommandBuffer() else {
return nil
}
// Create buffers
guard let bufferA = device.makeBuffer(bytes: matrixA,
length: matrixA.count * MemoryLayout<Float>.size,
options: []),
let bufferB = device.makeBuffer(bytes: matrixB,
length: matrixB.count * MemoryLayout<Float>.size,
options: []),
let resultBuffer = device.makeBuffer(length: rowsA * colsB * MemoryLayout<Float>.size,
options: []) else {
return nil
}
// Use MPS matrix multiplication
let matrixMult = MPSMatrixMultiplication(
device: device,
transposeLeft: false,
transposeRight: false,
resultRows: rowsA,
resultColumns: colsB,
interiorColumns: colsA,
alpha: 1.0,
beta: 0.0
)
let descA = MPSMatrixDescriptor(rows: rowsA, columns: colsA,
rowBytes: colsA * MemoryLayout<Float>.size,
dataType: .float32)
let descB = MPSMatrixDescriptor(rows: colsA, columns: colsB,
rowBytes: colsB * MemoryLayout<Float>.size,
dataType: .float32)
let descResult = MPSMatrixDescriptor(rows: rowsA, columns: colsB,
rowBytes: colsB * MemoryLayout<Float>.size,
dataType: .float32)
let mpsMatrixA = MPSMatrix(buffer: bufferA, descriptor: descA)
let mpsMatrixB = MPSMatrix(buffer: bufferB, descriptor: descB)
let mpsResult = MPSMatrix(buffer: resultBuffer, descriptor: descResult)
matrixMult.encode(commandBuffer: commandBuffer,
leftMatrix: mpsMatrixA,
rightMatrix: mpsMatrixB,
resultMatrix: mpsResult)
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
// Extract result
let resultPointer = resultBuffer.contents().bindMemory(to: Float.self, capacity: rowsA * colsB)
return Array(UnsafeBufferPointer(start: resultPointer, count: rowsA * colsB))
}
}
Custom Metal Compute Shaders
swift
// Metal Compute Pipeline
class MetalComputeEngine {
private let device: MTLDevice
private let commandQueue: MTLCommandQueue
private let library: MTLLibrary
init() {
guard let device = MTLCreateSystemDefaultDevice(),
let commandQueue = device.makeCommandQueue(),
let library = device.makeDefaultLibrary() else {
fatalError("Metal initialization failed")
}
self.device = device
self.commandQueue = commandQueue
self.library = library
}
func parallelSum(data: [Float]) -> Float {
guard let function = library.makeFunction(name: "parallel_sum"),
let pipeline = try? device.makeComputePipelineState(function: function),
let commandBuffer = commandQueue.makeCommandBuffer(),
let encoder = commandBuffer.makeComputeCommandEncoder() else {
return 0.0
}
// Create buffers
let inputBuffer = device.makeBuffer(bytes: data,
length: data.count * MemoryLayout<Float>.size,
options: [])!
let outputCount = (data.count + 255) / 256 // Number of thread groups
let outputBuffer = device.makeBuffer(length: outputCount * MemoryLayout<Float>.size,
options: [])!
var count = UInt32(data.count)
let countBuffer = device.makeBuffer(bytes: &count,
length: MemoryLayout<UInt32>.size,
options: [])!
// Configure compute encoder
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(inputBuffer, offset: 0, index: 0)
encoder.setBuffer(outputBuffer, offset: 0, index: 1)
encoder.setBuffer(countBuffer, offset: 0, index: 2)
// Dispatch threads
let threadsPerGroup = MTLSize(width: 256, height: 1, depth: 1)
let numGroups = MTLSize(width: outputCount, height: 1, depth: 1)
encoder.dispatchThreadgroups(numGroups, threadsPerThreadgroup: threadsPerGroup)
encoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
// Sum partial results on CPU (could be further optimized)
let resultPointer = outputBuffer.contents().bindMemory(to: Float.self, capacity: outputCount)
var totalSum: Float = 0.0
for i in 0..<outputCount {
totalSum += resultPointer[i]
}
return totalSum
}
}
Performance Monitoring & Optimization
GPU Memory Management
kotlin
// Android GPU Memory Tracking
class GPUMemoryManager(private val context: Context) {
private val memoryUsage = mutableMapOf<String, Long>()
private val activityManager = context.getSystemService(Context.ACTIVITY_SERVICE) as ActivityManager
fun trackTextureMemory(name: String, width: Int, height: Int, bytesPerPixel: Int) {
val memorySize = (width * height * bytesPerPixel).toLong()
memoryUsage[name] = memorySize
logMemoryUsage()
}
fun releaseTexture(name: String) {
memoryUsage.remove(name)
logMemoryUsage()
}
private fun logMemoryUsage() {
val totalGPUMemory = memoryUsage.values.sum()
val memInfo = ActivityManager.MemoryInfo()
activityManager.getMemoryInfo(memInfo)
Log.d("GPU Memory", "Total GPU texture memory: ${totalGPUMemory / (1024 * 1024)} MB")
Log.d("GPU Memory", "Available system memory: ${memInfo.availMem / (1024 * 1024)} MB")
if (totalGPUMemory > memInfo.availMem * 0.1) { // 10% of available memory
Log.w("GPU Memory", "High GPU memory usage detected - consider texture compression")
}
}
fun optimizeTextureFormats(): Map<String, String> {
return mapOf(
"RGBA8888" to "Use for high-quality images with transparency",
"RGB565" to "Use for opaque images to save 50% memory",
"ETC2" to "Use compressed format for Android (OpenGL ES 3.0+)",
"ASTC" to "Use for high-end devices with ASTC support",
"S3TC/DXT" to "Use for desktop/high-end mobile GPUs"
)
}
}
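Whether the compressed formats recommended by optimizeTextureFormats() are actually available on a given device can be checked against the GL extension string. A small sketch (it must run on a thread with a current OpenGL ES context):
kotlin
import android.opengl.GLES20

// Query the driver's extension string for the texture compression formats it exposes.
fun supportedCompressedFormats(): Map<String, Boolean> {
    val extensions = GLES20.glGetString(GLES20.GL_EXTENSIONS) ?: ""
    return mapOf(
        "ETC2" to true,  // mandatory in OpenGL ES 3.0+ contexts
        "ASTC" to extensions.contains("GL_KHR_texture_compression_astc_ldr"),
        "S3TC/DXT" to extensions.contains("GL_EXT_texture_compression_s3tc")
    )
}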
Asynchronous GPU Operations
kotlin
// Android - asynchronous GPU processing
class AsenkronGPUIsleyici {
private val glExecutor = Executors.newSingleThreadExecutor()
private val hesaplamaExecutor = Executors.newFixedThreadPool(2)
fun asenkronIsle(
girdiVerisi: FloatArray,
geriCagri: (FloatArray) -> Unit
) {
hesaplamaExecutor.submit {
try {
val sonuc = GPUdaIsle(girdiVerisi)
// Post the result back to the main thread
Handler(Looper.getMainLooper()).post {
geriCagri(sonuc)
}
} catch (e: Exception) {
Log.e("AsenkronGPU", "İşleme başarısız", e)
}
}
}
private fun GPUdaIsle(veri: FloatArray): FloatArray {
// Placeholder for the actual GPU processing implementation
return veri.map { it * 2.0f }.toFloatArray()
}
fun topluIsle(
topluVeriler: List<FloatArray>,
ilerlemeGeriCagri: (Int, Int) -> Unit,
tamamlanmaGeriCagri: (List<FloatArray>) -> Unit
) {
val sonuclar = mutableListOf<FloatArray>()
val toplamToplu = topluVeriler.size
topluVeriler.forEachIndexed { index, toplu ->
hesaplamaExecutor.submit {
val sonuc = GPUdaIsle(toplu)
synchronized(sonuclar) {
sonuclar.add(sonuc)
Handler(Looper.getMainLooper()).post {
ilerlemeGeriCagri(index + 1, toplamToplu)
if (sonuclar.size == toplamToplu) {
tamamlanmaGeriCagri(sonuclar.toList())
}
}
}
}
}
}
}
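Where kotlinx.coroutines is already a dependency, the same off-main-thread pattern is commonly written with suspending functions instead of executors and Handler callbacks, which also keeps batch results in input order. A sketch under that assumption:
kotlin
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.withContext

// Runs the (placeholder) GPU work off the main thread; callers simply suspend.
suspend fun processAsync(input: FloatArray): FloatArray = withContext(Dispatchers.Default) {
    input.map { it * 2.0f }.toFloatArray()
}

// Processes batches concurrently while preserving input order in the result list.
suspend fun processBatches(batches: List<FloatArray>): List<FloatArray> = coroutineScope {
    batches.map { batch -> async { processAsync(batch) } }.awaitAll()
}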
Hardware Detection & Fallback Strategies
Capability Detection
typescript
// Cross-platform hardware capability detection (React Native)
import { Platform, NativeModules } from 'react-native';
interface DonanimYetenekleri {
GPUvar: boolean;
NPUvar: boolean;
DSPvar: boolean;
gpuBellek: number;
hesaplamaBirimleri: number;
desteklenenFormatlar: string[];
}
class DonanimTespitci {
static async yetenekleriAl(): Promise<DonanimYetenekleri> {
if (Platform.OS === 'ios') {
return await this.iOSYetenekleriniAl();
} else {
return await this.androidYetenekleriniAl();
}
}
private static async iOSYetenekleriniAl(): Promise<DonanimYetenekleri> {
const yetenekler = await NativeModules.DonanimTespitci.iOSYetenekleriniAl();
return {
GPUvar: yetenekler.GPUvar,
NPUvar: yetenekler.neuralEngineVar,
DSPvar: yetenekler.imageSignalProcessorVar,
gpuBellek: yetenekler.gpuBellek,
hesaplamaBirimleri: yetenekler.gpuCekirdekleri,
desteklenenFormatlar: yetenekler.desteklenenPikselFormatlari
};
}
private static async androidYetenekleriniAl(): Promise<DonanimYetenekleri> {
const yetenekler = await NativeModules.DonanimTespitci.androidYetenekleriniAl();
return {
GPUvar: yetenekler.GPUvar,
NPUvar: yetenekler.NNAPIvar,
DSPvar: yetenekler.hexagonDSPvar,
gpuBellek: yetenekler.gpuBellek,
hesaplamaBirimleri: yetenekler.gpuCekirdekleri,
desteklenenFormatlar: yetenekler.desteklenenFormatlar
};
}
static optimalIsleyiciSec(
yetenekler: DonanimYetenekleri,
gorevTipi: 'goruntu' | 'ml' | 'hesaplama'
): IsleyiciTipi {
switch (gorevTipi) {
case 'goruntu':
return yetenekler.GPUvar ? 'gpu' : 'cpu';
case 'ml':
return yetenekler.NPUvar ? 'npu' :
yetenekler.GPUvar ? 'gpu' : 'cpu';
case 'hesaplama':
return yetenekler.DSPvar ? 'dsp' :
yetenekler.GPUvar ? 'gpu' : 'cpu';
default:
return 'cpu';
}
}
}
type IsleyiciTipi = 'cpu' | 'gpu' | 'npu' | 'dsp';
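The NativeModules.DonanimTespitci calls above imply a native module on each platform. A hedged Kotlin sketch of what the Android side might look like (the field names mirror the TypeScript mapping; the GPU memory and core counts are placeholders because Android has no public API for them):
kotlin
import android.content.pm.PackageManager
import android.os.Build
import com.facebook.react.bridge.Arguments
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod

class DonanimTespitciModule(private val ctx: ReactApplicationContext) :
    ReactContextBaseJavaModule(ctx) {

    override fun getName(): String = "DonanimTespitci"

    @ReactMethod
    fun androidYetenekleriniAl(promise: Promise) {
        val map = Arguments.createMap().apply {
            // The Vulkan feature flag is a reasonable proxy for "has a programmable GPU"
            putBoolean(
                "GPUvar",
                ctx.packageManager.hasSystemFeature(PackageManager.FEATURE_VULKAN_HARDWARE_VERSION)
            )
            // NNAPI ships with Android 8.1 (API 27) and later
            putBoolean("NNAPIvar", Build.VERSION.SDK_INT >= 27)
            putBoolean("hexagonDSPvar", false)   // no public API; needs vendor SDK checks
            putInt("gpuBellek", 0)               // placeholder: not exposed by public APIs
            putInt("gpuCekirdekleri", 0)         // placeholder: not exposed by public APIs
            putArray("desteklenenFormatlar", Arguments.createArray())
        }
        promise.resolve(map)
    }
}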
Best Practices
Hardware Acceleration Guidelines
Choose the right accelerator
- GPU: parallel graphics and compute workloads
- NPU: machine learning inference
- DSP: signal processing and audio
Memory optimization
- Pool GPU buffers
- Minimize CPU-GPU transfers
- Use appropriate memory types
Asynchronous processing
- Do not block the main thread
- Implement proper error handling
- Provide progress feedback
Fallback strategies
- Detect hardware capabilities
- Implement CPU fallbacks
- Degrade gracefully
Power efficiency
- Monitor the thermal state (see the sketch after this list)
- Balance performance against battery life
- Use adaptive quality settings
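For the thermal point above, Android 10+ exposes the device's thermal status directly. A sketch that lowers a render-quality level as throttling approaches (the quality levels are illustrative):
kotlin
import android.content.Context
import android.os.Build
import android.os.PowerManager

// Registers for thermal status changes (API 29+) and reports a reduced quality
// level as the device approaches thermal throttling.
fun watchThermalStatus(context: Context, onQualityChange: (Int) -> Unit) {
    if (Build.VERSION.SDK_INT < 29) return
    val pm = context.getSystemService(Context.POWER_SERVICE) as PowerManager
    pm.addThermalStatusListener { status ->
        val quality = when (status) {
            PowerManager.THERMAL_STATUS_NONE,
            PowerManager.THERMAL_STATUS_LIGHT -> 2     // full quality
            PowerManager.THERMAL_STATUS_MODERATE -> 1  // reduced quality
            else -> 0                                  // minimum quality
        }
        onQualityChange(quality)
    }
}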
Hardware acceleration is essential for performance-critical mobile applications. Implemented correctly, it can deliver significant performance improvements while preserving energy efficiency across devices with very different capabilities.