|
5 | 5 | */ |
6 | 6 |
|
7 | 7 | #include<array> |
| 8 | +#include<intrin.h> |
8 | 9 |
|
9 | 10 | #include<OvDebug/Logger.h> |
10 | 11 | #include<OvRendering/Resources/Mesh.h> |
@@ -81,38 +82,159 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve |
81 | 82 | } |
82 | 83 | } |
83 | 84 |
|
84 | | -voidOvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices) |
| 85 | +namespace |
85 | 86 | { |
86 | | -m_boundingSphere.position = OvMaths::FVector3::Zero; |
87 | | -m_boundingSphere.radius =0.0f; |
88 | | - |
89 | | -if (!p_vertices.empty()) |
| 87 | +OvRendering::Geometry::BoundingSphereComputeBoundingSphereSIMD(std::span<const OvRendering::Geometry::Vertex> p_vertices) |
90 | 88 | { |
91 | | -float minX = std::numeric_limits<float>::max(); |
92 | | -float minY = std::numeric_limits<float>::max(); |
93 | | -float minZ = std::numeric_limits<float>::max(); |
| 89 | +constsize_t vertexCount = p_vertices.size(); |
| 90 | + |
| 91 | +if (vertexCount ==0) |
| 92 | +{ |
| 93 | +return { |
| 94 | +.position = OvMaths::FVector3::Zero, |
| 95 | +.radius =0.0f |
| 96 | +}; |
| 97 | +} |
94 | 98 |
|
95 | | -float maxX = std::numeric_limits<float>::min(); |
96 | | -float maxY =std::numeric_limits<float>::min(); |
97 | | -float maxZ =std::numeric_limits<float>::min(); |
| 99 | +// Initialize SIMD registers formin/max with first vertex values |
| 100 | +__m128 vMinXYZ =_mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX); |
| 101 | +__m128 vMaxXYZ =_mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX); |
98 | 102 |
|
99 | | -for (constauto& vertex : p_vertices) |
| 103 | +// Process all vertices in one loop to find min/max |
| 104 | +for (size_t i =1; i < vertexCount; ++i) |
100 | 105 | { |
101 | | -minX =std::min(minX, vertex.position[0]); |
102 | | -minY =std::min(minY, vertex.position[1]); |
103 | | -minZ =std::min(minZ, vertex.position[2]); |
| 106 | +// Load vertex position directly - assumes position is aligned properly |
| 107 | +constfloat* posPtr = p_vertices[i].position; |
| 108 | +__m128 vPos =_mm_loadu_ps(posPtr);// Using loadu in case it's not 16-byte aligned |
| 109 | + |
| 110 | +// Update min and max in one pass |
| 111 | +vMinXYZ =_mm_min_ps(vMinXYZ, vPos); |
| 112 | +vMaxXYZ =_mm_max_ps(vMaxXYZ, vPos); |
| 113 | +} |
| 114 | + |
| 115 | +// Calculate center = (min + max) * 0.5 |
| 116 | +__m128 vCenter =_mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ),_mm_set1_ps(0.5f)); |
| 117 | + |
| 118 | +// Store center position |
| 119 | +float centerArr[4]; |
| 120 | +_mm_store_ps(centerArr, vCenter); |
| 121 | +auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] }; |
| 122 | + |
| 123 | +// Calculate radius - use dot product for distance calculation |
| 124 | +__m128 vMaxDistSq =_mm_setzero_ps(); |
| 125 | + |
| 126 | +// Pre-load center vector once outside the loop |
| 127 | +const __m128 vCenterXYZ =_mm_setr_ps( |
| 128 | +center.x, |
| 129 | +center.y, |
| 130 | +center.z, |
| 131 | +0.0f |
| 132 | +); |
| 133 | + |
| 134 | +// Unroll the loop by 4 for better throughput |
| 135 | +size_t i =0; |
| 136 | +constsize_t unrollCount = vertexCount & ~3ull;// Round down to multiple of 4 |
| 137 | + |
| 138 | +for (; i < unrollCount; i +=4) |
| 139 | +{ |
| 140 | +// Load 4 vertices at once |
| 141 | +constfloat* pos0 = p_vertices[i].position; |
| 142 | +constfloat* pos1 = p_vertices[i +1].position; |
| 143 | +constfloat* pos2 = p_vertices[i +2].position; |
| 144 | +constfloat* pos3 = p_vertices[i +3].position; |
| 145 | + |
| 146 | +__m128 vPos0 =_mm_loadu_ps(pos0); |
| 147 | +__m128 vDiff0 =_mm_sub_ps(vPos0, vCenterXYZ); |
| 148 | +__m128 vDistSq0 =_mm_dp_ps(vDiff0, vDiff0,0x77);// Dot product with mask 0x77 (sum xyz, store in all) |
| 149 | +vMaxDistSq =_mm_max_ps(vMaxDistSq, vDistSq0); |
| 150 | + |
| 151 | +__m128 vPos1 =_mm_loadu_ps(pos1); |
| 152 | +__m128 vDiff1 =_mm_sub_ps(vPos1, vCenterXYZ); |
| 153 | +__m128 vDistSq1 =_mm_dp_ps(vDiff1, vDiff1,0x77); |
| 154 | +vMaxDistSq =_mm_max_ps(vMaxDistSq, vDistSq1); |
| 155 | + |
| 156 | +__m128 vPos2 =_mm_loadu_ps(pos2); |
| 157 | +__m128 vDiff2 =_mm_sub_ps(vPos2, vCenterXYZ); |
| 158 | +__m128 vDistSq2 =_mm_dp_ps(vDiff2, vDiff2,0x77); |
| 159 | +vMaxDistSq =_mm_max_ps(vMaxDistSq, vDistSq2); |
| 160 | + |
| 161 | +__m128 vPos3 =_mm_loadu_ps(pos3); |
| 162 | +__m128 vDiff3 =_mm_sub_ps(vPos3, vCenterXYZ); |
| 163 | +__m128 vDistSq3 =_mm_dp_ps(vDiff3, vDiff3,0x77); |
| 164 | +vMaxDistSq =_mm_max_ps(vMaxDistSq, vDistSq3); |
| 165 | +} |
104 | 166 |
|
105 | | -maxX =std::max(maxX, vertex.position[0]); |
106 | | -maxY =std::max(maxY, vertex.position[1]); |
107 | | -maxZ =std::max(maxZ, vertex.position[2]); |
| 167 | +// Handle remaining vertices |
| 168 | +for (; i < vertexCount; ++i) |
| 169 | +{ |
| 170 | +constfloat* pos = p_vertices[i].position; |
| 171 | +__m128 vPos =_mm_loadu_ps(pos); |
| 172 | +__m128 vDiff =_mm_sub_ps(vPos, vCenterXYZ); |
| 173 | +__m128 vDistSq =_mm_dp_ps(vDiff, vDiff,0x77); |
| 174 | +vMaxDistSq =_mm_max_ps(vMaxDistSq, vDistSq); |
108 | 175 | } |
109 | 176 |
|
110 | | -m_boundingSphere.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } /2.0f; |
| 177 | +// Extract radius (sqrt of max squared distance) |
| 178 | +float maxDistSq; |
| 179 | +_mm_store_ss(&maxDistSq, vMaxDistSq); |
| 180 | + |
| 181 | +return { |
| 182 | +.position = center, |
| 183 | +.radius =std::sqrt(maxDistSq) |
| 184 | +}; |
| 185 | +} |
| 186 | + |
| 187 | +OvRendering::Geometry::BoundingSphereComputeBoundingSphereRegular(std::span<const OvRendering::Geometry::Vertex> p_vertices) |
| 188 | +{ |
| 189 | +auto result = OvRendering::Geometry::BoundingSphere{ |
| 190 | +.position = OvMaths::FVector3::Zero, |
| 191 | +.radius =0.0f |
| 192 | +}; |
111 | 193 |
|
112 | | -for (constauto& vertex :p_vertices) |
| 194 | +if (!p_vertices.empty()) |
113 | 195 | { |
114 | | -constauto& position =reinterpret_cast<const OvMaths::FVector3&>(vertex.position); |
115 | | -m_boundingSphere.radius =std::max(m_boundingSphere.radius,OvMaths::FVector3::Distance(m_boundingSphere.position, position)); |
| 196 | +float minX = std::numeric_limits<float>::max(); |
| 197 | +float minY = std::numeric_limits<float>::max(); |
| 198 | +float minZ = std::numeric_limits<float>::max(); |
| 199 | + |
| 200 | +float maxX = std::numeric_limits<float>::min(); |
| 201 | +float maxY = std::numeric_limits<float>::min(); |
| 202 | +float maxZ = std::numeric_limits<float>::min(); |
| 203 | + |
| 204 | +for (constauto& vertex : p_vertices) |
| 205 | +{ |
| 206 | +minX =std::min(minX, vertex.position[0]); |
| 207 | +minY =std::min(minY, vertex.position[1]); |
| 208 | +minZ =std::min(minZ, vertex.position[2]); |
| 209 | + |
| 210 | +maxX =std::max(maxX, vertex.position[0]); |
| 211 | +maxY =std::max(maxY, vertex.position[1]); |
| 212 | +maxZ =std::max(maxZ, vertex.position[2]); |
| 213 | +} |
| 214 | + |
| 215 | +result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } /2.0f; |
| 216 | + |
| 217 | +for (constauto& vertex : p_vertices) |
| 218 | +{ |
| 219 | +constauto& position =reinterpret_cast<const OvMaths::FVector3&>(vertex.position); |
| 220 | +result.radius =std::max(result.radius,OvMaths::FVector3::Distance(result.position, position)); |
| 221 | +} |
116 | 222 | } |
| 223 | + |
| 224 | +return result; |
| 225 | +} |
| 226 | +} |
| 227 | + |
| 228 | +voidOvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices) |
| 229 | +{ |
| 230 | +constexprbool useSIMD =true; |
| 231 | + |
| 232 | +ifconstexpr (useSIMD) |
| 233 | +{ |
| 234 | +m_boundingSphere =ComputeBoundingSphereSIMD(p_vertices); |
| 235 | +} |
| 236 | +else |
| 237 | +{ |
| 238 | +m_boundingSphere =ComputeBoundingSphereRegular(p_vertices); |
117 | 239 | } |
118 | 240 | } |