diff --git a/.gitmodules b/.gitmodules index 6e456528..0479e3d8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -34,3 +34,6 @@ [submodule "Plugins/nosWebRTC/Source/ssl-cert"] path = Plugins/nosWebRTC/Source/ssl-cert url = https://github.com/mediaz/ssl-cert.git +[submodule "Plugins/nosUtilities/External/freetype"] + path = Plugins/nosUtilities/External/freetype + url = https://github.com/freetype/freetype.git diff --git a/Plugins/CMakeLists.txt b/Plugins/CMakeLists.txt index 589829f7..63685e54 100644 --- a/Plugins/CMakeLists.txt +++ b/Plugins/CMakeLists.txt @@ -42,6 +42,7 @@ add_subdirectory(nosReflect) add_subdirectory(nosStrings) add_subdirectory(nosAnimation) add_subdirectory(nosGraphics) +add_subdirectory(nosGeometry) nos_get_targets(PLUGINS_COMMON_EXTERNAL_TARGETS "./External") nos_group_targets("${PLUGINS_COMMON_EXTERNAL_TARGETS}" "External") diff --git a/Plugins/nosFilters/Config/BokehDof.nosdef b/Plugins/nosFilters/Config/BokehDof.nosdef new file mode 100644 index 00000000..df5b08fb --- /dev/null +++ b/Plugins/nosFilters/Config/BokehDof.nosdef @@ -0,0 +1,121 @@ +{ + "nodes": [ + { + "class_name": "BokehDof", + "menu_info": { + "category": "Filters", + "display_name": "Bokeh DoF" + }, + "node": { + "class_name": "BokehDof", + "name": "Bokeh DoF", + "description": "Single-pass 2D depth-of-field. 
CoC is computed from a linear view-space Z input; samples are gathered on a Vogel disc weighted by the BokehShape kernel texture, so bokeh takes the shape painted into BokehShape.", + "contents_type": "Job", + "contents": { + "type": "nos.sys.vulkan.GPUNode", + "options": { + "shader": "Shaders/BokehDof.frag" + } + }, + "pins": [ + { + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": { + "filtering": "LINEAR" + } + }, + { + "name": "Depth", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": { + "filtering": "NEAREST" + } + }, + { + "name": "BokehShape", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": { + "filtering": "LINEAR" + } + }, + { + "name": "FocusDistance", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 5.0, + "min": 0.0, + "max": 1000.0 + }, + { + "name": "FocusRange", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 2.0, + "min": 0.01, + "max": 1000.0 + }, + { + "name": "MaxRadius", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 16.0, + "min": 0.0, + "max": 128.0 + }, + { + "name": "MinRadius", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.5, + "min": 0.0, + "max": 8.0 + }, + { + "name": "BackgroundIsFar", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 1.0, + "min": 0.0, + "max": 1.0 + }, + { + "name": "SampleCount", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 48.0, + "min": 4.0, + "max": 256.0 + }, + { + "name": "KernelRotation", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": 
"INPUT_PIN_OR_PROPERTY", + "data": 0.0, + "min": -6.2832, + "max": 6.2832 + }, + { + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY" + } + ] + } + } + ] +} diff --git a/Plugins/nosFilters/Config/BokehShape.nosdef b/Plugins/nosFilters/Config/BokehShape.nosdef new file mode 100644 index 00000000..3a466b9a --- /dev/null +++ b/Plugins/nosFilters/Config/BokehShape.nosdef @@ -0,0 +1,93 @@ +{ + "nodes": [ + { + "class_name": "BokehShape", + "menu_info": { + "category": "Filters", + "display_name": "Bokeh Shape" + }, + "node": { + "class_name": "BokehShape", + "name": "Bokeh Shape", + "description": "Procedural bokeh kernel generator. Produces a unit-disc grayscale mask shaped like a regular polygon aperture (blade count, roundness, rotation), with soft edge and optional rim brightening. Feed the Output into a Bokeh DoF node's BokehShape pin.", + "contents_type": "Job", + "contents": { + "type": "nos.sys.vulkan.GPUNode", + "options": { + "shader": "Shaders/BokehShape.frag" + } + }, + "pins": [ + { + "name": "BladeCount", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 6.0, + "min": 0.0, + "max": 16.0 + }, + { + "name": "Roundness", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.3, + "min": 0.0, + "max": 1.0 + }, + { + "name": "Rotation", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.0, + "min": -6.2832, + "max": 6.2832 + }, + { + "name": "EdgeSoftness", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.04, + "min": 0.0, + "max": 0.5 + }, + { + "name": "RimBoost", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.0, + "min": 0.0, + "max": 4.0 + }, + { + "name": "RimWidth", + "type_name": "float", + "show_as": "PROPERTY", + 
"can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.08, + "min": 0.005, + "max": 0.5 + }, + { + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY", + "data": { + "resolution": "CUSTOM", + "width": 128, + "height": 128, + "format": "R16_UNORM", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET", + "filtering": "LINEAR" + } + } + ] + } + } + ] +} diff --git a/Plugins/nosFilters/Config/DepthOfField.nosdef b/Plugins/nosFilters/Config/DepthOfField.nosdef new file mode 100644 index 00000000..1db5a70f --- /dev/null +++ b/Plugins/nosFilters/Config/DepthOfField.nosdef @@ -0,0 +1,990 @@ +{ "nodes": [ + { + "class_name": "nos.filters.DepthOfField", + "node": { + "id": "5899940c-437e-4f71-b119-bb80fb5d1e1a", + "name": "DepthOfField", + "class_name": "nos.filters.DepthOfField", + "pins": [ + { + "id": "1950c2e6-a0f6-485b-8a02-bded8a2f6ed5", + "name": "Depth", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED" + }, + "def": { + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "PortalPin", + "contents": { "source_id": "74a1bfd0-4f2d-447b-945c-8d0cb67a2120" } + }, + { + "id": "e0b8f433-212f-48f6-ba4f-c8a194e1a707", + "name": "FocusDistance", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 5.0, + "min": 0.0, + "max": 1000.0, + "def": 5.0, + "step": 10.0, + "contents_type": "PortalPin", + "contents": { "source_id": "e709c7b4-9a59-4546-be53-0dc51abc5605" } + }, + { + "id": "68187c92-92f3-40d0-8b24-df6f33f9f649", + "name": "FocusRange", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": 
"INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 2.0, + "min": 0.01, + "max": 1000.0, + "def": 2.0, + "step": 9.9999, + "contents_type": "PortalPin", + "contents": { "source_id": "534a26e9-1ebd-4ed2-89fb-bdf5d34b6ec1" } + }, + { + "id": "42554a0a-2d70-4ec4-a2ea-594ad71559f3", + "name": "MaxRadius", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 16.0, + "min": 0.0, + "max": 128.0, + "def": 16.0, + "step": 1.28, + "contents_type": "PortalPin", + "contents": { "source_id": "63f77504-73aa-4b89-8849-65e27649b272" } + }, + { + "id": "312c4450-a4ad-4690-ba3d-afcbc93da6eb", + "name": "MinRadius", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 0.5, + "min": 0.0, + "max": 8.0, + "def": 0.5, + "step": 0.08, + "contents_type": "PortalPin", + "contents": { "source_id": "97561978-6da1-4a33-a6bc-c654008a8261" } + }, + { + "id": "ce6c0d45-8ce1-47ef-bd73-addda06d826e", + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "def": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "PortalPin", + "contents": { "source_id": "c278680b-43b5-40ce-b1af-a4551c2e58f0" } + }, + { + "id": "2be9d3ba-9386-43b0-ae1c-58168be2a289", + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 
1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED", + "filtering": "LINEAR" + }, + "def": { + "filtering": "LINEAR" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "PortalPin", + "contents": { "source_id": "479248dc-200d-4a4d-87d2-f2c7c77f667f" } + } + ], + "pos": { "x": 0.0, "y": 0.0 }, + "contents_type": "Graph", + "contents": { "nodes": [ + { + "id": "393281e0-2cb8-4b90-a98e-a8e708719229", + "name": "Output", + "class_name": "nos.internal.GraphOutput", + "pins": [ + { + "id": "2e4ec877-e014-49ae-ae1d-881a0e4d1ac5", + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_ONLY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "def": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "c278680b-43b5-40ce-b1af-a4551c2e58f0", + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "referred_by": [ + "ce6c0d45-8ce1-47ef-bd73-addda06d826e" + ], + "def": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "advanced_property": true, + "meta_data_map": [ + { 
"key": "AdvancedProperty", "value": "true" }, + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 1329.0, "y": 1025.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + }, + { + "id": "6a261add-ff1c-49ba-b9b7-a3bbad8e1fb3", + "name": "Directional DoF (1)", + "class_name": "nos.filters.DirectionalDof", + "pins": [ + { + "id": "b1b03fce-6863-42e6-a78a-260743b5441d", + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET", + "filtering": "LINEAR" + }, + "def": { + "filtering": "LINEAR" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "14aab6c6-10ce-4a39-9c6f-8c5633fe59e2", + "name": "Depth", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED" + }, + "def": { + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "d6a91b7a-b576-487b-bd2c-89fee90a37d1", + "name": "FocusDistance", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 7.4, + "min": 0.0, + "max": 1000.0, + "def": 5.0, + "step": 10.0, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "55bffdc3-e0fa-4c0f-9ead-5a3b96c232bf", + 
"name": "FocusRange", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 3.1, + "min": 0.01, + "max": 1000.0, + "def": 2.0, + "step": 9.9999, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "bdc5ae5a-10cc-4c3c-b013-573a64bd8ec6", + "name": "MaxRadius", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 5.0, + "min": 0.0, + "max": 128.0, + "def": 16.0, + "step": 1.28, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "d77d3716-69f5-4c5d-a342-414dc11597fb", + "name": "MinRadius", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 0.0, + "min": 0.0, + "max": 8.0, + "def": 0.5, + "step": 0.08, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "ad25df82-1942-4f9f-a062-c072261a2d92", + "name": "BackgroundIsFar", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 1.0, + "min": 0.0, + "max": 1.0, + "def": 1.0, + "step": 0.01, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "aaff92e1-63fe-4253-8edb-1f34a76019c9", + "name": "Direction", + "type_name": "nos.fb.vec2", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { "x": 0.0, "y": 1.0 }, + "min": { "x": -1.0, "y": -1.0 }, + "max": { "x": 1.0, "y": 1.0 }, + "def": { "x": 1.0, "y": 0.0 }, + "step": 0.02, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "ad6603e0-2b1d-4bf6-a1d1-af0fc05978a2", + "name": "SampleCount", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 12.0, + "min": 1.0, + "max": 64.0, + "def": 12.0, + "step": 0.63, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "0ef0f439-9766-4957-8931-a02ce1019bd1", + "name": 
"Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "def": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 1129.0, "y": 1073.0 }, + "contents_type": "Job", + "contents": { "type": "nos.sys.vulkan.GPUNode", "options": { "shader": "Shaders/DirectionalDof.frag" } }, + "function_category": "Default Node", + "description": "1D depth-aware blur. CoC is computed per pixel from a linear view-space Z input. Chain two instances along (1,0) and (0,1) for a separable disc bokeh.", + "plugin_version": { "major": 1, "minor": 7, "patch": 0 } + }, + { + "id": "deac982f-b51b-4ae0-b6c6-9b2998d3e5a9", + "name": "MaxRadius", + "class_name": "nos.internal.GraphInput", + "pins": [ + { + "id": "d5387b2e-f8c6-4b2e-8a42-a11eed779a1d", + "name": "Output", + "type_name": "float", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "visualizer": { + }, + "data": 5.0, + "min": 0.0, + "max": 128.0, + "def": 16.0, + "step": 1.28, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "63f77504-73aa-4b89-8849-65e27649b272", + "name": "Input", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 16.0, + "referred_by": [ + "42554a0a-2d70-4ec4-a2ea-594ad71559f3" + ], + "min": 0.0, + "max": 128.0, + "def": 16.0, + "step": 1.28, + "meta_data_map": [ + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + 
"pos": { "x": 655.0, "y": 1250.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + }, + { + "id": "af576b2d-dde0-4d7b-86fc-37cb9f97b49e", + "name": "Directional DoF", + "class_name": "nos.filters.DirectionalDof", + "pins": [ + { + "id": "9e368dde-bb31-44d8-aaad-782e92fe2366", + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED", + "filtering": "LINEAR" + }, + "def": { + "filtering": "LINEAR" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "7e88a91a-1eca-4cc5-8dce-7c4aca61368d", + "name": "Depth", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED" + }, + "def": { + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "c1b814c4-e424-40a0-99d6-0437d948d1d7", + "name": "FocusDistance", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 7.4, + "min": 0.0, + "max": 1000.0, + "def": 5.0, + "step": 10.0, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "78893474-3dfc-4a36-b897-77760ba19c8c", + "name": "FocusRange", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 3.1, + "min": 0.01, + "max": 
1000.0, + "def": 2.0, + "step": 9.9999, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "4f471215-bebf-49be-a6e4-909c394d1f1a", + "name": "MaxRadius", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 5.0, + "min": 0.0, + "max": 128.0, + "def": 16.0, + "step": 1.28, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "05132381-cf95-4253-9fc2-e87f84b70dd8", + "name": "MinRadius", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 0.0, + "min": 0.0, + "max": 8.0, + "def": 0.5, + "step": 0.08, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "2b52da00-b45d-41ae-a1ec-c88566879043", + "name": "BackgroundIsFar", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 1.0, + "min": 0.0, + "max": 1.0, + "def": 1.0, + "step": 0.01, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "ac684e31-1a90-462f-8b64-2b368a93b563", + "name": "Direction", + "type_name": "nos.fb.vec2", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { "x": 1.0, "y": 0.0 }, + "min": { "x": -1.0, "y": -1.0 }, + "max": { "x": 1.0, "y": 1.0 }, + "def": { "x": 1.0, "y": 0.0 }, + "step": 0.02, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "b8527f03-5c5c-4a41-b485-fa05e0f50cb1", + "name": "SampleCount", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 12.0, + "min": 1.0, + "max": 64.0, + "def": 12.0, + "step": 0.63, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "8f4d23a7-3b94-4a1c-ba14-d1ce47e92acd", + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": 
"HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "def": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED STORAGE RENDER_TARGET" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 855.0, "y": 977.0 }, + "contents_type": "Job", + "contents": { "type": "nos.sys.vulkan.GPUNode", "options": { "shader": "Shaders/DirectionalDof.frag" } }, + "function_category": "Default Node", + "description": "1D depth-aware blur. CoC is computed per pixel from a linear view-space Z input. Chain two instances along (1,0) and (0,1) for a separable disc bokeh.", + "plugin_version": { "major": 1, "minor": 7, "patch": 0 } + }, + { + "id": "8b497dab-5466-4d32-a440-125976e3a3ee", + "name": "Depth", + "class_name": "nos.internal.GraphInput", + "pins": [ + { + "id": "9587b7b1-8fc7-437b-9459-ee73f90de097", + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED" + }, + "def": { + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "74a1bfd0-4f2d-447b-945c-8d0cb67a2120", + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED" + }, + "referred_by": [ + 
"1950c2e6-a0f6-485b-8a02-bded8a2f6ed5" + ], + "def": { + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" }, + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 655.0, "y": 1025.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + }, + { + "id": "9813ee9d-1f75-4554-9f9c-b9ecafc2e9fe", + "name": "FocusDistance", + "class_name": "nos.internal.GraphInput", + "pins": [ + { + "id": "7c60934b-ba19-4faf-9923-411511649cd0", + "name": "Output", + "type_name": "float", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "visualizer": { + }, + "data": 7.4, + "min": 0.0, + "max": 1000.0, + "def": 5.0, + "step": 10.0, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "e709c7b4-9a59-4546-be53-0dc51abc5605", + "name": "Input", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 5.0, + "referred_by": [ + "e0b8f433-212f-48f6-ba4f-c8a194e1a707" + ], + "min": 0.0, + "max": 1000.0, + "def": 5.0, + "step": 10.0, + "meta_data_map": [ + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 655.0, "y": 1100.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + }, + { + "id": "2c0861b9-e416-4741-b56d-8dfa81c49516", + "name": "FocusRange", + "class_name": "nos.internal.GraphInput", + "pins": [ + { + "id": "6a933bab-7bf6-4388-b990-abd1b9729e64", + "name": "Output", + "type_name": "float", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "visualizer": { + }, + "data": 3.1, + "min": 0.01, + "max": 1000.0, + "def": 2.0, + "step": 9.9999, + "contents_type": "JobPin", + "contents": { } + }, + { 
+ "id": "534a26e9-1ebd-4ed2-89fb-bdf5d34b6ec1", + "name": "Input", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 2.0, + "referred_by": [ + "68187c92-92f3-40d0-8b24-df6f33f9f649" + ], + "min": 0.01, + "max": 1000.0, + "def": 2.0, + "step": 9.9999, + "meta_data_map": [ + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 655.0, "y": 1175.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + }, + { + "id": "3951aaae-16df-4b07-b1a9-b8b2a01b19c7", + "name": "MinRadius", + "class_name": "nos.internal.GraphInput", + "pins": [ + { + "id": "f0feee29-3782-49fe-a834-94e2b57916a8", + "name": "Output", + "type_name": "float", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "visualizer": { + }, + "data": 0.0, + "min": 0.0, + "max": 8.0, + "def": 0.5, + "step": 0.08, + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "97561978-6da1-4a33-a6bc-c654008a8261", + "name": "Input", + "type_name": "float", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": 0.5, + "referred_by": [ + "312c4450-a4ad-4690-ba3d-afcbc93da6eb" + ], + "min": 0.0, + "max": 8.0, + "def": 0.5, + "step": 0.08, + "meta_data_map": [ + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 655.0, "y": 1325.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + }, + { + "id": "a1d16ddd-0144-4daa-97b2-e9b3b019c8c1", + "name": "Input", + "class_name": "nos.internal.GraphInput", + "pins": [ + { + "id": "c271ac23-2923-45c2-b262-b654455a93c3", + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + 
"can_show_as": "OUTPUT_PIN_ONLY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED", + "filtering": "LINEAR" + }, + "def": { + "filtering": "LINEAR" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + }, + { + "id": "479248dc-200d-4a4d-87d2-f2c7c77f667f", + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + }, + "data": { + "resolution": "HD", + "width": 1920, + "height": 1080, + "format": "R16G16B16A16_SFLOAT", + "usage": "TRANSFER_SRC TRANSFER_DST SAMPLED", + "filtering": "LINEAR" + }, + "referred_by": [ + "2be9d3ba-9386-43b0-ae1c-58168be2a289" + ], + "def": { + "filtering": "LINEAR" + }, + "advanced_property": true, + "meta_data_map": [ + { "key": "AdvancedProperty", "value": "true" }, + { "key": "PinHidden", "value": "true" } + ], + "contents_type": "JobPin", + "contents": { } + } + ], + "pos": { "x": 655.0, "y": 1400.0 }, + "contents_type": "Job", + "contents": { "type": "" }, + "function_category": "Default Node", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + } + ], "connections": [ + { "from": "8f4d23a7-3b94-4a1c-ba14-d1ce47e92acd", "to": "b1b03fce-6863-42e6-a78a-260743b5441d", "id": "83839676-0760-4699-ae80-c0a789e273d8" }, + { "from": "f0feee29-3782-49fe-a834-94e2b57916a8", "to": "d77d3716-69f5-4c5d-a342-414dc11597fb", "id": "4c05135f-6001-4679-b39c-b248559ae56d" }, + { "from": "9587b7b1-8fc7-437b-9459-ee73f90de097", "to": "14aab6c6-10ce-4a39-9c6f-8c5633fe59e2", "id": "231cdfe5-7ac7-4013-9d20-68d5af8509b7" }, + { "from": "7c60934b-ba19-4faf-9923-411511649cd0", "to": "d6a91b7a-b576-487b-bd2c-89fee90a37d1", "id": "1cdaef73-876c-472a-97ff-04bf1f01348e" }, + { "from": "c271ac23-2923-45c2-b262-b654455a93c3", "to": 
"9e368dde-bb31-44d8-aaad-782e92fe2366", "id": "231fc88c-a52e-48d0-a6ee-8c2fdfe3ef0d" }, + { "from": "6a933bab-7bf6-4388-b990-abd1b9729e64", "to": "55bffdc3-e0fa-4c0f-9ead-5a3b96c232bf", "id": "7c1cae59-5834-420e-9d3d-e4767f6c3273" }, + { "from": "d5387b2e-f8c6-4b2e-8a42-a11eed779a1d", "to": "bdc5ae5a-10cc-4c3c-b013-573a64bd8ec6", "id": "d74bdb3a-8c8c-4f82-8038-01a237e27a89" }, + { "from": "0ef0f439-9766-4957-8931-a02ce1019bd1", "to": "2e4ec877-e014-49ae-ae1d-881a0e4d1ac5", "id": "353cc954-d098-417a-8331-357b879ba654" }, + { "from": "9587b7b1-8fc7-437b-9459-ee73f90de097", "to": "7e88a91a-1eca-4cc5-8dce-7c4aca61368d", "id": "b126f4c4-d748-46f2-be51-ce1c778c0c4b" }, + { "from": "7c60934b-ba19-4faf-9923-411511649cd0", "to": "c1b814c4-e424-40a0-99d6-0437d948d1d7", "id": "fc25a2f4-0af4-49ae-9052-133a76cfc044" }, + { "from": "6a933bab-7bf6-4388-b990-abd1b9729e64", "to": "78893474-3dfc-4a36-b897-77760ba19c8c", "id": "f6ba18f8-0ef1-42db-a774-c4b02aa78fac" }, + { "from": "d5387b2e-f8c6-4b2e-8a42-a11eed779a1d", "to": "4f471215-bebf-49be-a6e4-909c394d1f1a", "id": "afd9d7ff-f9e2-4a67-b874-2cfb2f870447" }, + { "from": "f0feee29-3782-49fe-a834-94e2b57916a8", "to": "05132381-cf95-4253-9fc2-e87f84b70dd8", "id": "82919455-4a51-490a-8ab2-201952d2e126" } + ] }, + "function_category": "Default Node", + "display_name": "Depth of Field", + "plugin_version": { "major": 0, "minor": 0, "patch": 0 } + } + } + ] } diff --git a/Plugins/nosFilters/Config/DirectionalDof.nosdef b/Plugins/nosFilters/Config/DirectionalDof.nosdef new file mode 100644 index 00000000..427e4385 --- /dev/null +++ b/Plugins/nosFilters/Config/DirectionalDof.nosdef @@ -0,0 +1,121 @@ +{ + "nodes": [ + { + "class_name": "DirectionalDof", + "menu_info": { + "category": "Filters", + "display_name": "Directional DoF" + }, + "node": { + "class_name": "DirectionalDof", + "name": "Directional DoF", + "description": "1D depth-aware blur. CoC is computed per pixel from a linear view-space Z input. 
Chain two instances along (1,0) and (0,1) for a separable disc bokeh.", + "contents_type": "Job", + "contents": { + "type": "nos.sys.vulkan.GPUNode", + "options": { + "shader": "Shaders/DirectionalDof.frag" + } + }, + "pins": [ + { + "name": "Input", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": { + "filtering": "LINEAR" + } + }, + { + "name": "Depth", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "INPUT_PIN", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": { + "filtering": "NEAREST" + } + }, + { + "name": "FocusDistance", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 5.0, + "min": 0.0, + "max": 1000.0 + }, + { + "name": "FocusRange", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 2.0, + "min": 0.01, + "max": 1000.0 + }, + { + "name": "MaxRadius", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 16.0, + "min": 0.0, + "max": 128.0 + }, + { + "name": "MinRadius", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 0.5, + "min": 0.0, + "max": 8.0 + }, + { + "name": "BackgroundIsFar", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 1.0, + "min": 0.0, + "max": 1.0 + }, + { + "name": "Direction", + "type_name": "nos.fb.vec2", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": { + "x": 1.0, + "y": 0.0 + }, + "min": { + "x": -1.0, + "y": -1.0 + }, + "max": { + "x": 1.0, + "y": 1.0 + } + }, + { + "name": "SampleCount", + "type_name": "float", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": 12.0, + "min": 1.0, + "max": 64.0 + }, + { + "name": "Output", + "type_name": "nos.sys.vulkan.Texture", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY" + } + ] + } + } + ] 
+} diff --git a/Plugins/nosFilters/Filters.noscfg b/Plugins/nosFilters/Filters.noscfg index 072c6ce0..3660bb3f 100644 --- a/Plugins/nosFilters/Filters.noscfg +++ b/Plugins/nosFilters/Filters.noscfg @@ -2,7 +2,7 @@ "info": { "id": { "name": "nos.filters", - "version": "1.7.0" + "version": "1.8.0" }, "display_name": "Filters", "description": "Collection of image filters.", @@ -27,6 +27,10 @@ "Config/Diff.nosdef", "Config/GaussianBlur.nosdef", "Config/DirectionalBlur.nosdef", + "Config/DirectionalDof.nosdef", + "Config/DepthOfField.nosdef", + "Config/BokehDof.nosdef", + "Config/BokehShape.nosdef", "Config/KawaseLightStreak.nosdef", "Config/Kuwahara.nosdef", "Config/PremultiplyAlpha.nosdef", diff --git a/Plugins/nosFilters/Shaders/BokehDof.frag b/Plugins/nosFilters/Shaders/BokehDof.frag new file mode 100644 index 00000000..b365ddf2 --- /dev/null +++ b/Plugins/nosFilters/Shaders/BokehDof.frag @@ -0,0 +1,105 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. +// Single-pass 2D bokeh depth-of-field with a kernel-texture shaping the bokeh. +// +// Computes a per-pixel circle of confusion (CoC) from a linear view-space Z +// input, then gathers samples on a Vogel (golden-angle) disc within that CoC. +// Each sample's contribution is weighted by BokehShape sampled at the same +// unit-disc position, so the bokeh takes on the shape painted into BokehShape +// (regular polygon, ring, custom artwork, etc.). + +#version 450 + +#define MASK_THRESHOLD 0.001 +#define GOLDEN_ANGLE 2.39996322972865332 + +layout(binding = 0) uniform sampler2D Input; +layout(binding = 1) uniform sampler2D Depth; +layout(binding = 2) uniform sampler2D BokehShape; +layout(binding = 3) uniform BokehDofParams +{ + // Focus distance in the same units as the Depth input (linear view-space Z). + float FocusDistance; + // Distance from focus where CoC reaches MaxRadius. + float FocusRange; + // Maximum CoC radius in pixels. 
+ float MaxRadius; + // Skip the gather when CoC <= MinRadius (keeps focused regions crisp & cheap). + float MinRadius; + // Applies to samples with Depth <= 0: 0 = treat as "near focus" (stays sharp); 1 = treat as far plane; values in between blend the two. + float BackgroundIsFar; + // Total Vogel-disc sample count. ~32 = soft, ~64 = clean, ~128 = no banding. + float SampleCount; + // Rotate the kernel lookup (radians). Useful for animated highlights. + float KernelRotation; +} +Params; + +layout(location = 0) out vec4 rt; +layout(location = 0) in vec2 uv; + +float CocFromDepth(float Z) // Maps a depth sample to a CoC radius in pixels, clamped to [0, MaxRadius]. +{ + if (Z <= 0.0) // Non-positive depth: substitute FocusDistance or a plane 4 * FocusRange beyond it, blended by BackgroundIsFar. + Z = mix(Params.FocusDistance, Params.FocusDistance + Params.FocusRange * 4.0, Params.BackgroundIsFar); + + float D = abs(Z - Params.FocusDistance); + float Coc = D / max(Params.FocusRange, 1e-4); + return clamp(Coc * Params.MaxRadius, 0.0, Params.MaxRadius); +} + +void main() +{ + vec2 TextureSize = textureSize(Input, 0); + vec2 TexelSize = 1.0 / TextureSize; + + vec4 CenterColor = texture(Input, uv); + float CenterZ = texture(Depth, uv).r; + float CenterCoC = CocFromDepth(CenterZ); + + if (CenterCoC <= Params.MinRadius || Params.MaxRadius < MASK_THRESHOLD) + { + rt = CenterColor; + return; + } + + int N = int(max(1.0, Params.SampleCount)); + float CosR = cos(Params.KernelRotation); + float SinR = sin(Params.KernelRotation); + + // Vogel disc: golden-angle spiral with sqrt radius for uniform area density. + // Sample 0 is the center; included implicitly via CenterColor initialization. + vec4 Accum = CenterColor; + float Weight = texture(BokehShape, vec2(0.5)).r; + Accum *= Weight; + + for (int i = 1; i < N; ++i) + { + float Frac = float(i) / float(N); + float R = sqrt(Frac); // unit-disc radius + float Th = float(i) * GOLDEN_ANGLE; + vec2 Unit = vec2(cos(Th) * R, sin(Th) * R); // unit disc position + + // Rotated lookup into the bokeh kernel.
+ vec2 ShapeUv = vec2(Unit.x * CosR - Unit.y * SinR, + Unit.x * SinR + Unit.y * CosR) * 0.5 + 0.5; + float WShape = texture(BokehShape, ShapeUv).r; + if (WShape <= MASK_THRESHOLD) + continue; + + vec2 Ofs = Unit * CenterCoC * TexelSize; + vec4 Sample = texture(Input, uv + Ofs); + float ZSamp = texture(Depth, uv + Ofs).r; + float CocSmp = CocFromDepth(ZSamp); + + // Per-sample CoC rejection prevents in-focus pixels bleeding outward. + // A sample contributes only if its own CoC is at least its distance from center. + float Dist = R * CenterCoC; + float WCoc = Dist <= CocSmp ? 1.0 : 0.0; + + float W = WShape * WCoc; + Accum += Sample * W; + Weight += W; + } + + rt = Weight > 1e-4 ? Accum / Weight : CenterColor; // no usable taps (dark kernel center and every sample rejected): keep the source pixel instead of outputting black +} diff --git a/Plugins/nosFilters/Shaders/BokehShape.frag b/Plugins/nosFilters/Shaders/BokehShape.frag new file mode 100644 index 00000000..cb963629 --- /dev/null +++ b/Plugins/nosFilters/Shaders/BokehShape.frag @@ -0,0 +1,77 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. +// Procedural bokeh kernel generator. +// +// Produces a grayscale unit-disc mask shaped like a regular polygon aperture +// (number of blades configurable) with optional roundness, rotation, soft edge +// and brightened rim. Intended as input to a kernel-weighted DoF gather. +// +// Convention: image is treated as the [-1, 1] unit square; pixels outside the +// kernel shape return 0; pixels inside return ~1, with a smooth edge falloff +// over EdgeSoftness. The mask is normalized so that center stays at 1. + +#version 450 + +#define PI 3.14159265358979323846 + +layout(location = 0) out vec4 rt; +layout(location = 0) in vec2 uv; + +layout(binding = 1) uniform BokehShapeParams +{ + // Aperture blade count. Values below 3 produce a perfect circle. + float BladeCount; + // 0 = sharp polygon, 1 = perfect circle. Interpolates polygon edge toward disc. + float Roundness; + // Rotation of the polygon (radians). + float Rotation; + // Soft falloff width at the edge, in [0, 1] of unit-disc radius.
+ float EdgeSoftness; + // Extra brightness boost near the rim, [0, 1]. Mimics cat's-eye / specular bokeh. + float RimBoost; + // Width of the rim brightening band, in [0, 1] of radius. + float RimWidth; +} +Params; + +void main() +{ + // Map uv [0,1] to centered coords [-1,1] + vec2 Pos = uv * 2.0 - 1.0; + float R = length(Pos); + + if (R > 1.0) + { + rt = vec4(0.0); + return; + } + + float Blades = max(Params.BladeCount, 1.0); + + // Polygon edge radius along this angular direction. + // sectorAngle = 2*pi / N; angle from sector center is a; edge distance = cos(pi/N) / cos(a). + float PolygonR = 1.0; + if (Blades >= 3.0) + { + float Theta = (R > 0.0 ? atan(Pos.y, Pos.x) : 0.0) - Params.Rotation; // atan(0, 0) is undefined in GLSL; pin the exact center to angle 0 + float SectorAngle = 2.0 * PI / Blades; + float HalfSector = SectorAngle * 0.5; + // Angle measured from the nearest sector centerline, in [-HalfSector, +HalfSector]. + float A = mod(Theta + HalfSector, SectorAngle) - HalfSector; + PolygonR = cos(HalfSector) / max(cos(A), 1e-4); + } + + // Roundness mixes polygon edge toward the circumscribed circle (radius 1). + float EdgeR = mix(PolygonR, 1.0, clamp(Params.Roundness, 0.0, 1.0)); + + // Soft edge: 1 inside, 0 past the edge, smooth across EdgeSoftness. + float Soft = max(Params.EdgeSoftness, 1e-4); + float Mask = 1.0 - smoothstep(EdgeR - Soft, EdgeR, R); + + // Rim brightening: a soft band just inside the edge. + float RimW = max(Params.RimWidth, 1e-4); + float RimPos = (R - (EdgeR - RimW)) / RimW; // 0 at inner edge of rim, 1 at outer + float Rim = clamp(1.0 - abs(RimPos * 2.0 - 1.0), 0.0, 1.0); + Mask += Rim * Params.RimBoost * Mask; + + rt = vec4(Mask, Mask, Mask, 1.0); +} diff --git a/Plugins/nosFilters/Shaders/DirectionalDof.frag b/Plugins/nosFilters/Shaders/DirectionalDof.frag new file mode 100644 index 00000000..308cfcc7 --- /dev/null +++ b/Plugins/nosFilters/Shaders/DirectionalDof.frag @@ -0,0 +1,95 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. +// Directional depth-of-field pass.
+// Computes circle-of-confusion (CoC) per pixel from a linear view-space Z input, +// then does a 1D weighted gather along Direction. Chain two instances +// (Direction = (1,0) and Direction = (0,1)) for a separable approximation of +// disc bokeh; visually close to a gaussian bokeh and cheap. + +#version 450 + +#define MASK_THRESHOLD 0.001 + +layout(binding = 0) uniform sampler2D Input; +layout(binding = 1) uniform sampler2D Depth; +layout(binding = 2) uniform DirectionalDofParams +{ + // Focus distance in the same units as the Depth input (linear view-space Z). + float FocusDistance; + // Distance from focus where CoC reaches MaxRadius. + // Smaller value = sharper focus falloff; larger = gentler. + float FocusRange; + // Maximum CoC radius in pixels. + float MaxRadius; + // 0 = treat zero depth as "no info, keep sharp"; 1 = treat zero depth as far. + float BackgroundIsFar; + vec2 Direction; // Blur axis (x, y); normalized in main(), so only its orientation matters. + // Pixels whose CoC is <= MinRadius skip the gather and pass through unblurred; raise to skip tiny blurs. + float MinRadius; + // Sample count along the direction (one side; total taps = 2*N+1). Higher = smoother. + float SampleCount; +} +Params; + +layout(location = 0) out vec4 rt; +layout(location = 0) in vec2 uv; + +float CocFromDepth(float Z) +{ + // Treat Z<=0 (no depth signal) as either "near focus" (BackgroundIsFar=0) + // or as far plane (BackgroundIsFar=1). Picking far avoids halos around empty regions.
+ if (Z <= 0.0) + Z = mix(Params.FocusDistance, Params.FocusDistance + Params.FocusRange * 4.0, Params.BackgroundIsFar); + + float D = abs(Z - Params.FocusDistance); + float Coc = D / max(Params.FocusRange, 1e-4); + Coc = clamp(Coc * Params.MaxRadius, 0.0, Params.MaxRadius); + return Coc; +} + +void main() +{ + vec2 TextureSize = textureSize(Input, 0); + vec2 TexelSize = 1.0 / TextureSize; + + vec4 CenterColor = texture(Input, uv); + float CenterZ = texture(Depth, uv).r; + float CenterCoC = CocFromDepth(CenterZ); + + if (CenterCoC <= Params.MinRadius || Params.MaxRadius < MASK_THRESHOLD) + { + rt = CenterColor; + return; + } + + vec2 Dir = dot(Params.Direction, Params.Direction) > 0.0 ? normalize(Params.Direction) : vec2(0.0); // normalize() of a zero vector is undefined; a zero Direction degenerates to no blur + + int N = int(max(1.0, Params.SampleCount)); + float RadiusPx = CenterCoC; + float Step = RadiusPx / float(N); + + // Box-weighted average; chaining two orthogonal passes approximates a soft disc. + // Per-sample CoC rejection: a tap contributes only if its own CoC reaches back to this pixel, so in-focus fragments do not bleed outward. + vec4 Accum = CenterColor; + float Weight = 1.0; + + for (int i = 1; i <= N; ++i) + { + float T = min(float(i) * Step, RadiusPx); // tap distance in pixels; clamped so rounding cannot push the outermost tap past the CoC + vec2 Ofs = Dir * T * TexelSize; + + vec4 SPos = texture(Input, uv + Ofs); + float ZPos = texture(Depth, uv + Ofs).r; + float CocPos = CocFromDepth(ZPos); + float WPos = T <= CocPos ? 1.0 : 0.0; + + vec4 SNeg = texture(Input, uv - Ofs); + float ZNeg = texture(Depth, uv - Ofs).r; + float CocNeg = CocFromDepth(ZNeg); + float WNeg = T <= CocNeg ? 1.0 : 0.0; + + Accum += SPos * WPos + SNeg * WNeg; + Weight += WPos + WNeg; + } + + rt = Accum / Weight; +} diff --git a/Plugins/nosGeometry/CMakeLists.txt b/Plugins/nosGeometry/CMakeLists.txt new file mode 100644 index 00000000..82e8b4ed --- /dev/null +++ b/Plugins/nosGeometry/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright MediaZ Teknoloji A.S. All Rights Reserved. + +# Vendored OpenFBX, used to read object transforms out of .fbx files. +# Built under a plugin-unique target name so it never clashes with the +# "openFBX" target the zd plugins create from their own copy.
+if (NOT TARGET nosGeometry_openFBX) + add_library(nosGeometry_openFBX STATIC + External/openFBX/libdeflate.c + External/openFBX/ofbx.cpp) + target_include_directories(nosGeometry_openFBX PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/External/openFBX") + nos_group_targets("nosGeometry_openFBX" "External") +endif() + +# nos.sys.track provides the CoordinateFrame type and the shared +# CoordinateFrameConv.h helpers used to express FBX transforms in a frame +# convention that composes with nosTrack's Convert/Track Transform nodes. +set(MODULE_DEPENDENCIES "nos.sys.track-1.1") +set(dep_idx 0) +foreach(module_name_version ${MODULE_DEPENDENCIES}) + string(REPLACE "-" ";" module_name_version ${module_name_version}) + list(GET module_name_version 0 module_name) + list(GET module_name_version 1 module_version) + nos_get_module("${module_name}" "${module_version}" DEP_${dep_idx}) + list(APPEND MODULE_DEPENDENCIES_TARGETS ${DEP_${dep_idx}}) +endforeach() + +set(DEPENDENCIES ${NOS_PLUGIN_SDK_TARGET} nosGeometry_openFBX ${MODULE_DEPENDENCIES_TARGETS}) +set(INCLUDE_FOLDERS "") + +nos_add_plugin("nosGeometry" "${DEPENDENCIES}" "${INCLUDE_FOLDERS}") + +# Helpers need C++20; REQUIRED makes configuration fail instead of silently falling back to an older standard. +set_target_properties("nosGeometry" PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON) diff --git a/Plugins/nosGeometry/Config/ReadFBXTransform.nosdef b/Plugins/nosGeometry/Config/ReadFBXTransform.nosdef new file mode 100644 index 00000000..8950b04b --- /dev/null +++ b/Plugins/nosGeometry/Config/ReadFBXTransform.nosdef @@ -0,0 +1,78 @@ +{ + "nodes": [ + { + "class_name": "ReadFBXTransform", + "menu_info": { + "category": "Geometry", + "display_name": "Read FBX Transform", + "name_aliases": [ "FBX", "Load FBX", "FBX Transform", "FBX Reader" ] + }, + "node": { + "class_name": "ReadFBXTransform", + "contents_type": "Job", + "description": "Reads an .fbx file and outputs the local and global transform of a\nselected object inside it.
Pick the object from the 'Object' dropdown,\nwhich is populated with the names found in the file.", + "pins": [ + { + "name": "Path", + "type_name": "string", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "visualizer": { + "type": "FILE_PICKER", + "file_extensions": [ "fbx" ], + "file_picker_type": "OPEN" + }, + "description": "Path to the .fbx file to read." + }, + { + "name": "Object", + "type_name": "string", + "show_as": "PROPERTY", + "can_show_as": "PROPERTY_ONLY", + "data": "", + "visualizer": { + "type": "COMBO_BOX", + "name": "" + }, + "description": "Object inside the .fbx whose transform is reported.\nThe list is populated once the file is loaded." + }, + { + "name": "SourceFrame", + "display_name": "Source Frame", + "type_name": "nos.sys.track.CoordinateFrame", + "show_as": "PROPERTY", + "can_show_as": "INPUT_PIN_OR_PROPERTY", + "data": "RH_YUp_FwdNegZ_RightX", + "description": "Coordinate frame the .fbx is authored in. The output transforms are expressed in this frame; set Convert Transform's 'SourceFrame' to the same value." + }, + { + "name": "LocalTransform", + "display_name": "Local Transform", + "type_name": "nos.fb.Transform", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "description": "Transform of the object relative to its parent." + }, + { + "name": "GlobalTransform", + "display_name": "Global Transform", + "type_name": "nos.fb.Transform", + "show_as": "OUTPUT_PIN", + "can_show_as": "OUTPUT_PIN_ONLY", + "description": "World transform of the object, accounting for its parent hierarchy." + }, + { + "name": "IsLoaded", + "display_name": "Is Loaded", + "type_name": "bool", + "show_as": "PROPERTY", + "can_show_as": "OUTPUT_PIN_OR_PROPERTY", + "data": false, + "readonly": true, + "description": "True if a valid 'Path' was selected and the file could be loaded." 
+ } + ] + } + } + ] +} diff --git a/Plugins/nosGeometry/External/openFBX/libdeflate.c b/Plugins/nosGeometry/External/openFBX/libdeflate.c new file mode 100644 index 00000000..e421d791 --- /dev/null +++ b/Plugins/nosGeometry/External/openFBX/libdeflate.c @@ -0,0 +1,4193 @@ +// ofbx changes : removed unused code, single .h and .c +/* + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * This is a highly optimized DEFLATE decompressor. It is much faster than + * vanilla zlib, typically well over twice as fast, though results vary by CPU. 
+ * + * Why this is faster than vanilla zlib: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be refilled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + * - On x86_64, a version of the decompression routine is compiled with BMI2 + * instructions enabled and is used automatically at runtime when supported. + */ + +/* + * lib_common.h - internal header included by all library code + */ + +#ifndef LIB_LIB_COMMON_H +#define LIB_LIB_COMMON_H + +#ifdef LIBDEFLATE_H + /* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ +# error "lib_common.h must always be included before libdeflate.h" +#endif + +#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) +#elif defined(__GNUC__) +# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) +#else +# define LIBDEFLATE_EXPORT_SYM +#endif + +/* + * On i386, gcc assumes that the stack is 16-byte aligned at function entry. + * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi) + * only guarantee 4-byte alignment when calling functions. This is mainly an + * issue on Windows, but it has been seen on Linux too. Work around this ABI + * incompatibility by realigning the stack pointer when entering libdeflate. + * This prevents crashes in SSE/AVX code. 
+ */ +#if defined(__GNUC__) && defined(__i386__) +# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATE_ALIGN_STACK +#endif + +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK + +/* + * common_defs.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef COMMON_DEFS_H +#define COMMON_DEFS_H + +#include "libdeflate.h" + +#include <stdbool.h> +#include <stddef.h> /* for size_t */ +#include <stdint.h> +#ifdef _MSC_VER +# include <intrin.h> /* for _BitScan*() and other intrinsics */ +# include <stdlib.h> /* for _byteswap_*() */ + /* Disable MSVC warnings that are expected.
*/ + /* /W2 */ +# pragma warning(disable : 4146) /* unary minus on unsigned type */ + /* /W3 */ +# pragma warning(disable : 4018) /* signed/unsigned mismatch */ +# pragma warning(disable : 4244) /* possible loss of data */ +# pragma warning(disable : 4267) /* possible loss of precision */ +# pragma warning(disable : 4310) /* cast truncates constant value */ + /* /W4 */ +# pragma warning(disable : 4100) /* unreferenced formal parameter */ +# pragma warning(disable : 4127) /* conditional expression is constant */ +# pragma warning(disable : 4189) /* local variable initialized but not referenced */ +# pragma warning(disable : 4232) /* nonstandard extension used */ +# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */ +# pragma warning(disable : 4295) /* array too small to include terminating null */ +#endif +#ifndef FREESTANDING +# include <string.h> /* for memcpy() */ +#endif + +/* ========================================================================== */ +/* Target architecture */ +/* ========================================================================== */ + +/* If possible, define a compiler-independent ARCH_* macro.
*/ +#undef ARCH_X86_64 +#undef ARCH_X86_32 +#undef ARCH_ARM64 +#undef ARCH_ARM32 +#ifdef _MSC_VER +# if defined(_M_X64) +# define ARCH_X86_64 +# elif defined(_M_IX86) +# define ARCH_X86_32 +# elif defined(_M_ARM64) +# define ARCH_ARM64 +# elif defined(_M_ARM) +# define ARCH_ARM32 +# endif +#else +# if defined(__x86_64__) +# define ARCH_X86_64 +# elif defined(__i386__) +# define ARCH_X86_32 +# elif defined(__aarch64__) +# define ARCH_ARM64 +# elif defined(__arm__) +# define ARCH_ARM32 +# endif +#endif + +/* ========================================================================== */ +/* Type definitions */ +/* ========================================================================== */ + +/* Fixed-width integer types */ +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* ssize_t, if not available in <sys/types.h> */ +#ifdef _MSC_VER +# ifdef _WIN64 + typedef long long ssize_t; +# else + typedef long ssize_t; +# endif +#endif + +/* + * Word type of the target architecture. Use 'size_t' instead of + * 'unsigned long' to account for platforms such as Windows that use 32-bit + * 'unsigned long' on 64-bit architectures. + */ +typedef size_t machine_word_t; + +/* Number of bytes in a word */ +#define WORDBYTES ((int)sizeof(machine_word_t)) + +/* Number of bits in a word */ +#define WORDBITS (8 * WORDBYTES) + +/* ========================================================================== */ +/* Optional compiler features */ +/* ========================================================================== */ + +/* Compiler version checks. Only use when absolutely necessary.
*/ +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +# define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define GCC_PREREQ(major, minor) 0 +#endif +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif + +/* + * Macros to check for compiler support for attributes and builtins. clang + * implements these macros, but gcc doesn't, so generally any use of one of + * these macros must also be combined with a gcc version check. + */ +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +/* inline - suggest that a function be inlined */ +#ifdef _MSC_VER +# define inline __inline +#endif /* else assume 'inline' is usable as-is */ + +/* forceinline - force a function to be inlined, if possible */ +#if defined(__GNUC__) || __has_attribute(always_inline) +# define forceinline inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define forceinline __forceinline +#else +# define forceinline inline +#endif + +/* MAYBE_UNUSED - mark a function or variable as maybe unused */ +#if defined(__GNUC__) || __has_attribute(unused) +# define MAYBE_UNUSED __attribute__((unused)) +#else +# define MAYBE_UNUSED +#endif + +/* + * restrict - hint that writes only occur through the given pointer. + * + * Don't use MSVC's __restrict, since it has nonstandard behavior. + * Standard restrict is okay, if it is supported. 
+ */ +#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L) +# if defined(__GNUC__) || defined(__clang__) +# define restrict __restrict__ +# else +# define restrict +# endif +#endif /* else assume 'restrict' is usable as-is */ + +/* likely(expr) - hint that an expression is usually true */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define likely(expr) __builtin_expect(!!(expr), 1) +#else +# define likely(expr) (expr) +#endif + +/* unlikely(expr) - hint that an expression is usually false */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define unlikely(expr) __builtin_expect(!!(expr), 0) +#else +# define unlikely(expr) (expr) +#endif + +/* prefetchr(addr) - prefetch into L1 cache for read */ +#undef prefetchr +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchr(addr) __builtin_prefetch((addr), 0) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0) +# elif defined(ARCH_ARM64) +# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchr(addr) __prefetch(addr) +# endif +#endif +#ifndef prefetchr +# define prefetchr(addr) +#endif + +/* prefetchw(addr) - prefetch into L1 cache for write */ +#undef prefetchw +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchw(addr) __builtin_prefetch((addr), 1) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchw(addr) _m_prefetchw(addr) +# elif defined(ARCH_ARM64) +# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchw(addr) __prefetchw(addr) +# endif +#endif +#ifndef prefetchw +# define prefetchw(addr) +#endif + +/* + * _aligned_attribute(n) - declare that the annotated variable, or variables of + * the annotated type, must be aligned on n-byte boundaries. 
+ */ +#undef _aligned_attribute +#if defined(__GNUC__) || __has_attribute(aligned) +# define _aligned_attribute(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _aligned_attribute(n) __declspec(align(n)) +#endif + +/* + * _target_attribute(attrs) - override the compilation target for a function. + * + * This accepts one or more comma-separated suffixes to the -m prefix jointly + * forming the name of a machine-dependent option. On gcc-like compilers, this + * enables codegen for the given targets, including arbitrary compiler-generated + * code as well as the corresponding intrinsics. On other compilers this macro + * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. + */ +#if GCC_PREREQ(4, 4) || __has_attribute(target) +# define _target_attribute(attrs) __attribute__((target(attrs))) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 +#else +# define _target_attribute(attrs) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif + +/* ========================================================================== */ +/* Miscellaneous macros */ +/* ========================================================================== */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1)) +#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d))) + +/* ========================================================================== */ +/* Endianness handling */ +/* ========================================================================== */ + +/* + * CPU_IS_LITTLE_ENDIAN() - 1 if the CPU is little endian, or 0 if it is big + * endian. When possible this is a compile-time macro that can be used in + * preprocessor conditionals. 
As a fallback, a generic method is used that + * can't be used in preprocessor conditionals but should still be optimized out. + */ +#if defined(__BYTE_ORDER__) /* gcc v4.6+ and clang */ +# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#elif defined(_MSC_VER) +# define CPU_IS_LITTLE_ENDIAN() true +#else +static forceinline bool CPU_IS_LITTLE_ENDIAN(void) +{ + union { + u32 w; + u8 b; + } u; + + u.w = 1; + return u.b; +} +#endif + +/* bswap16(v) - swap the bytes of a 16-bit integer */ +static forceinline u16 bswap16(u16 v) +{ +#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) + return __builtin_bswap16(v); +#elif defined(_MSC_VER) + return _byteswap_ushort(v); +#else + return (v << 8) | (v >> 8); +#endif +} + +/* bswap32(v) - swap the bytes of a 32-bit integer */ +static forceinline u32 bswap32(u32 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) + return _byteswap_ulong(v); +#else + return ((v & 0x000000FF) << 24) | + ((v & 0x0000FF00) << 8) | + ((v & 0x00FF0000) >> 8) | + ((v & 0xFF000000) >> 24); +#endif +} + +/* bswap64(v) - swap the bytes of a 64-bit integer */ +static forceinline u64 bswap64(u64 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) + return _byteswap_uint64(v); +#else + return ((v & 0x00000000000000FF) << 56) | + ((v & 0x000000000000FF00) << 40) | + ((v & 0x0000000000FF0000) << 24) | + ((v & 0x00000000FF000000) << 8) | + ((v & 0x000000FF00000000) >> 8) | + ((v & 0x0000FF0000000000) >> 24) | + ((v & 0x00FF000000000000) >> 40) | + ((v & 0xFF00000000000000) >> 56); +#endif +} + +#define le16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap16(v)) +#define le32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap32(v)) +#define le64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap64(v)) +#define be16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? 
bswap16(v) : (v)) +#define be32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap32(v) : (v)) +#define be64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap64(v) : (v)) + +/* ========================================================================== */ +/* Unaligned memory accesses */ +/* ========================================================================== */ + +/* + * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed + * efficiently on the target platform, otherwise 0. + */ +#if (defined(__GNUC__) || defined(__clang__)) && \ + (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ + /* + * For all compilation purposes, WebAssembly behaves like any other CPU + * instruction set. Even though WebAssembly engine might be running on + * top of different actual CPU architectures, the WebAssembly spec + * itself permits unaligned access and it will be fast on most of those + * platforms, and simulated at the engine level on others, so it's + * worth treating it as a CPU architecture with fast unaligned access. + */ defined(__wasm__)) +# define UNALIGNED_ACCESS_IS_FAST 1 +#elif defined(_MSC_VER) +# define UNALIGNED_ACCESS_IS_FAST 1 +#else +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* + * Implementing unaligned memory accesses using memcpy() is portable, and it + * usually gets optimized appropriately by modern compilers. I.e., each + * memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled to a load or store + * instruction, not to an actual function call. + * + * We no longer use the "packed struct" approach to unaligned accesses, as that + * is nonstandard, has unclear semantics, and doesn't receive enough testing + * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). + * + * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception + * where memcpy() generates inefficient code + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). 
However, we no longer + * consider that one case important enough to maintain different code for. + * If you run into it, please just use a newer version of gcc (or use clang). + */ + +#ifdef FREESTANDING +# define MEMCOPY __builtin_memcpy +#else +# define MEMCOPY memcpy +#endif + +/* Unaligned loads and stores without endianness conversion */ + +#define DEFINE_UNALIGNED_TYPE(type) \ +static forceinline type \ +load_##type##_unaligned(const void *p) \ +{ \ + type v; \ + \ + MEMCOPY(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static forceinline void \ +store_##type##_unaligned(type v, void *p) \ +{ \ + MEMCOPY(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16) +DEFINE_UNALIGNED_TYPE(u32) +DEFINE_UNALIGNED_TYPE(u64) +DEFINE_UNALIGNED_TYPE(machine_word_t) + +#undef MEMCOPY + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +/* Unaligned loads with endianness conversion */ + +static forceinline u16 +get_unaligned_le16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[1] << 8) | p[0]; +} + +static forceinline u16 +get_unaligned_be16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[0] << 8) | p[1]; +} + +static forceinline u32 +get_unaligned_le32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[3] << 24) | ((u32)p[2] << 16) | + ((u32)p[1] << 8) | p[0]; +} + +static forceinline u32 +get_unaligned_be32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[0] << 24) | ((u32)p[1] << 16) | + ((u32)p[2] << 8) | p[3]; +} + +static forceinline u64 +get_unaligned_le64(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le64_bswap(load_u64_unaligned(p)); + else + return ((u64)p[7] << 56) | ((u64)p[6] << 48) | + ((u64)p[5] << 40) | ((u64)p[4] 
<< 32) | + ((u64)p[3] << 24) | ((u64)p[2] << 16) | + ((u64)p[1] << 8) | p[0]; +} + +static forceinline machine_word_t +get_unaligned_leword(const u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return get_unaligned_le32(p); + else + return get_unaligned_le64(p); +} + +/* Unaligned stores with endianness conversion */ + +static forceinline void +put_unaligned_le16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(le16_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + } +} + +static forceinline void +put_unaligned_be16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(be16_bswap(v), p); + } else { + p[0] = (u8)(v >> 8); + p[1] = (u8)(v >> 0); + } +} + +static forceinline void +put_unaligned_le32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(le32_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + } +} + +static forceinline void +put_unaligned_be32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(be32_bswap(v), p); + } else { + p[0] = (u8)(v >> 24); + p[1] = (u8)(v >> 16); + p[2] = (u8)(v >> 8); + p[3] = (u8)(v >> 0); + } +} + +static forceinline void +put_unaligned_le64(u64 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(le64_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + p[4] = (u8)(v >> 32); + p[5] = (u8)(v >> 40); + p[6] = (u8)(v >> 48); + p[7] = (u8)(v >> 56); + } +} + +static forceinline void +put_unaligned_leword(machine_word_t v, u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + put_unaligned_le32(v, p); + else + put_unaligned_le64(v, p); +} + +/* ========================================================================== */ +/* Bit manipulation functions */ +/* 
========================================================================== */ + +/* + * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least + * significant end) of the *most* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +static forceinline unsigned +bsr32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clz) + return 31 - __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanReverse(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static forceinline unsigned +bsr64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) + return 63 - __builtin_clzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanReverse64(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static forceinline unsigned +bsrw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsr32(v); + else + return bsr64(v); +} + +/* + * Bit Scan Forward (BSF) - find the 0-based index (relative to the least + * significant end) of the *least* significant 1 bit in the input value. The + * input value must be nonzero! 
+ */ + +static forceinline unsigned +bsf32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctz) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanForward(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static forceinline unsigned +bsf64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctzll) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanForward64(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static forceinline unsigned +bsfw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsf32(v); + else + return bsf64(v); +} + +/* + * rbit32(v): reverse the bits in a 32-bit integer. This doesn't have a + * fallback implementation; use '#ifdef rbit32' to check if this is available. + */ +#undef rbit32 +#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \ + (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) +static forceinline u32 +rbit32(u32 v) +{ + __asm__("rbit %0, %1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64) +static forceinline u32 +rbit32(u32 v) +{ + __asm__("rbit %w0, %w1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#endif + +#endif /* COMMON_DEFS_H */ + + +typedef void *(*malloc_func_t)(size_t); +typedef void (*free_func_t)(void *); + +extern malloc_func_t libdeflate_default_malloc_func; +extern free_func_t libdeflate_default_free_func; + +void *libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size); +void libdeflate_aligned_free(free_func_t free_func, void *ptr); + +#ifdef FREESTANDING +/* + * With -ffreestanding, may be missing, and we must provide + * implementations of memset(), memcpy(), memmove(), 
and memcmp(). + * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html + * + * Also, -ffreestanding disables interpreting calls to these functions as + * built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call, + * not be optimized to a single load instruction. For performance reasons we + * don't want that. So, declare these functions as macros that expand to the + * corresponding built-ins. This approach is recommended in the gcc man page. + * We still need the actual function definitions in case gcc calls them. + */ +void *memset(void *s, int c, size_t n); +#define memset(s, c, n) __builtin_memset((s), (c), (n)) + +void *memcpy(void *dest, const void *src, size_t n); +#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) + +void *memmove(void *dest, const void *src, size_t n); +#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) + +int memcmp(const void *s1, const void *s2, size_t n); +#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) + +#undef LIBDEFLATE_ENABLE_ASSERTIONS +#else +#include +#endif + +/* + * Runtime assertion support. Don't enable this in production builds; it may + * hurt performance significantly. 
+ */ +#ifdef LIBDEFLATE_ENABLE_ASSERTIONS +void libdeflate_assertion_failed(const char *expr, const char *file, int line); +#define ASSERT(expr) { if (unlikely(!(expr))) \ + libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } +#else +#define ASSERT(expr) (void)(expr) +#endif + +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) + +#endif /* LIB_LIB_COMMON_H */ + +/* + * deflate_constants.h - constants for the DEFLATE compression format + */ + +#ifndef LIB_DEFLATE_CONSTANTS_H +#define LIB_DEFLATE_CONSTANTS_H + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 + +/* Maximum supported match offset (in bytes) */ +#define DEFLATE_MAX_MATCH_OFFSET 32768 + +/* log2 of DEFLATE_MAX_MATCH_OFFSET */ +#define DEFLATE_WINDOW_ORDER 15 + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. 
*/ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* The maximum number of symbols across all codes */ +#define DEFLATE_MAX_NUM_SYMS 288 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_FIRST_LEN_SYM 257 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* The maximum codeword length across all codes */ +#define DEFLATE_MAX_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + */ +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 + +#endif /* LIB_DEFLATE_CONSTANTS_H */ + +/* + * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c + * + * Copyright 2020 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_CPU_FEATURES_COMMON_H +#define LIB_CPU_FEATURES_COMMON_H + +#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) + /* for strdup() and strtok_r() */ +# undef _ANSI_SOURCE +# ifndef __APPLE__ +# undef _GNU_SOURCE +# define _GNU_SOURCE +# endif +# include +# include +# include +#endif + +struct cpu_feature { + u32 bit; + const char *name; +}; + +#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) +/* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. */ +static inline void +disable_cpu_features_for_testing(u32 *features, + const struct cpu_feature *feature_table, + size_t feature_table_length) +{ + char *env_value, *strbuf, *p, *saveptr = NULL; + size_t i; + + env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); + if (!env_value) + return; + strbuf = strdup(env_value); + if (!strbuf) + abort(); + p = strtok_r(strbuf, ",", &saveptr); + while (p) { + for (i = 0; i < feature_table_length; i++) { + if (strcmp(p, feature_table[i].name) == 0) { + *features &= ~feature_table[i].bit; + break; + } + } + if (i == feature_table_length) { + fprintf(stderr, + "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", + p); + abort(); + } + p = strtok_r(NULL, ",", &saveptr); + } + free(strbuf); +} +#else /* TEST_SUPPORT__DO_NOT_USE */ +static inline void +disable_cpu_features_for_testing(u32 *features, + const struct cpu_feature *feature_table, + size_t feature_table_length) +{ +} +#endif /* !TEST_SUPPORT__DO_NOT_USE */ + +#endif /* LIB_CPU_FEATURES_COMMON_H */ + +/* + * x86/cpu_features.h - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * 
obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_X86_CPU_FEATURES_H +#define LIB_X86_CPU_FEATURES_H + +#define HAVE_DYNAMIC_X86_CPU_FEATURES 0 + +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) + +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER) +# undef HAVE_DYNAMIC_X86_CPU_FEATURES +# define HAVE_DYNAMIC_X86_CPU_FEATURES 1 +#endif + +#define X86_CPU_FEATURE_SSE2 0x00000001 +#define X86_CPU_FEATURE_PCLMUL 0x00000002 +#define X86_CPU_FEATURE_AVX 0x00000004 +#define X86_CPU_FEATURE_AVX2 0x00000008 +#define X86_CPU_FEATURE_BMI2 0x00000010 + +#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) +#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL)) +#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX)) +#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) +#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) + +#if HAVE_DYNAMIC_X86_CPU_FEATURES +#define X86_CPU_FEATURES_KNOWN 0x80000000 +extern volatile u32 libdeflate_x86_cpu_features; + +void libdeflate_init_x86_cpu_features(void); + +static inline u32 get_x86_cpu_features(void) +{ + if (libdeflate_x86_cpu_features == 0) + libdeflate_init_x86_cpu_features(); + return libdeflate_x86_cpu_features; +} +#else /* HAVE_DYNAMIC_X86_CPU_FEATURES */ +static inline u32 get_x86_cpu_features(void) { return 0; } +#endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */ + +/* + * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not + * available in the main target couldn't be used in 'target' attribute + * functions. Unfortunately clang has no feature test macro for this, so we + * have to check its version. 
+ */ +#if HAVE_DYNAMIC_X86_CPU_FEATURES && \ + (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER)) +# define HAVE_TARGET_INTRINSICS 1 +#else +# define HAVE_TARGET_INTRINSICS 0 +#endif + +/* SSE2 */ +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && \ + (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) +# define HAVE_SSE2_NATIVE 1 +#else +# define HAVE_SSE2_NATIVE 0 +#endif +#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) + +/* PCLMUL */ +#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_PCLMUL_NATIVE 1 +#else +# define HAVE_PCLMUL_NATIVE 0 +#endif +#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER))) +# define HAVE_PCLMUL_INTRIN 1 +#else +# define HAVE_PCLMUL_INTRIN 0 +#endif + +/* AVX */ +#ifdef __AVX__ +# define HAVE_AVX_NATIVE 1 +#else +# define HAVE_AVX_NATIVE 0 +#endif +#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX_INTRIN 1 +#else +# define HAVE_AVX_INTRIN 0 +#endif + +/* AVX2 */ +#ifdef __AVX2__ +# define HAVE_AVX2_NATIVE 1 +#else +# define HAVE_AVX2_NATIVE 0 +#endif +#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX2_INTRIN 1 +#else +# define HAVE_AVX2_INTRIN 0 +#endif + +/* BMI2 */ +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_BMI2_NATIVE 1 +#else +# define HAVE_BMI2_NATIVE 0 +#endif +#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_BMI2_INTRIN 1 +#else +# define HAVE_BMI2_INTRIN 0 +#endif + +#endif /* ARCH_X86_32 || ARCH_X86_64 */ + +#endif /* LIB_X86_CPU_FEATURES_H */ + + +/* + * If the expression passed to SAFETY_CHECK() evaluates to false, then the + * decompression routine 
immediately returns LIBDEFLATE_BAD_DATA, indicating the + * compressed data is invalid. + * + * Theoretically, these checks could be disabled for specialized applications + * where all input to the decompressor will be trusted. + */ +#if 0 +# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") +# define SAFETY_CHECK(expr) (void)(expr) +#else +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +#endif + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: a pointer to the next unread byte in the input buffer + * + * - in_end: a pointer to just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer or from the implicit appended zero bytes + * + * - bitsleft: the number of bits in 'bitbuf' available to be consumed. + * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually + * contain more bits than this. However, only the bits counted + * by 'bitsleft' can actually be consumed; the rest can only be + * used for preloading. + * + * As a micro-optimization, we allow bits 8 and higher of + * 'bitsleft' to contain garbage. When consuming the bits + * associated with a decode table entry, this allows us to do + * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. + * On some CPUs, this helps reduce instruction dependencies. + * This does have the disadvantage that 'bitsleft' sometimes + * needs to be cast to 'u8', such as when it's used as a shift + * amount in REFILL_BITS_BRANCHLESS(). But that one happens + * for free since most CPUs ignore high bits in shift amounts. 
+ * + * - overread_count: the total number of implicit appended zero bytes that + * have been loaded into the bitbuffer, including any + * counted by 'bitsleft' and any already consumed + */ + +/* + * The type for the bitbuffer variable ('bitbuf' described above). For best + * performance, this should have size equal to a machine word. + * + * 64-bit platforms have a significant advantage: they get a bigger bitbuffer + * which they don't have to refill as often. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) + +/* BITMASK(n) returns a bitmask of length 'n'. */ +#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) + +/* + * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value + * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if + * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). + */ +#define MAX_BITSLEFT \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) + +/* + * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be + * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer. + * Since only whole bytes can be added to 'bitsleft', the worst case is + * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. + */ +#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) + +/* + * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed + * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not* + * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a + * byte-at-a-time refill method near the end of input.) This may exceed the + * number of consumable bits (counted by 'bitsleft'). Any bits not counted in + * 'bitsleft' can only be used for precomputation and cannot be consumed. + */ +#define FASTLOOP_PRELOADABLE_NBITS \ + (UNALIGNED_ACCESS_IS_FAST ? 
BITBUF_NBITS : CONSUMABLE_NBITS) + +/* + * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be + * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any + * subsequent consumptions. This is 1 bit if the branchless refill method is + * being used, and 0 bits otherwise. + */ +#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) + +/* + * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been + * refilled, then it's always possible to consume 'n' bits from it. 'n' should + * be a compile-time constant, to enable compile-time evaluation. + */ +#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) + +/* + * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's + * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to + * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The + * arguments should be compile-time constants to enable compile-time evaluation. + */ +#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ + (CONSUMABLE_NBITS >= (consume_nbits) && \ + FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) + +/* + * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by + * reading the next word from the input buffer and updating 'in_next' and + * 'bitsleft' based on how many bits were refilled -- counting whole bytes only. + * This is much faster than reading a byte at a time, at least if the CPU is + * little endian and supports fast unaligned memory accesses. + * + * The simplest way of branchlessly updating 'bitsleft' would be: + * + * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; + * + * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than + * WORDBITS, so that in binary it looks like 111111 or 11111. 
Then, we update + * 'bitsleft' by just setting the bits above the low 3 bits: + * + * bitsleft |= MAX_BITSLEFT & ~7; + * + * That compiles down to a single instruction like 'or $0x38, %rbp'. Using + * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be + * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior. + * + * The simplest way of branchlessly updating 'in_next' would be: + * + * in_next += (MAX_BITSLEFT - bitsleft) >> 3; + * + * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this + * isn't really better: + * + * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; + * + * An alternative which can be marginally better is the following: + * + * in_next += sizeof(bitbuf_t) - 1; + * in_next -= (bitsleft >> 3) & 0x7; + * + * It seems this would increase the number of CPU instructions from 3 (sub, shr, + * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield + * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially + * more efficient because the length of the longest dependency chain decreases + * from 3 to 2. This alternative also has the advantage that it ignores the + * high bits in 'bitsleft', so it is compatible with the micro-optimization we + * use where we let the high bits of 'bitsleft' contain garbage. + */ +#define REFILL_BITS_BRANCHLESS() \ +do { \ + bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ + in_next += sizeof(bitbuf_t) - 1; \ + in_next -= (bitsleft >> 3) & 0x7; \ + bitsleft |= MAX_BITSLEFT & ~7; \ +} while (0) + +/* + * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable + * contains at least CONSUMABLE_NBITS consumable bits. + * + * This checks for the end of input, and it doesn't guarantee + * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop. + * + * If we would overread the input buffer, we just don't read anything, leaving + * the bits zeroed but marking them filled. 
This simplifies the decompressor + * because it removes the need to always be able to distinguish between real + * overreads and overreads caused only by the decompressor's own lookahead. + * + * We do still keep track of the number of bytes that have been overread, for + * two reasons. First, it allows us to determine the exact number of bytes that + * were consumed once the stream ends or an uncompressed block is reached. + * Second, it allows us to stop early if the overread amount gets so large (more + * than sizeof bitbuf) that it can only be caused by a real overread. (The + * second part is arguably unneeded, since libdeflate is buffer-based; given + * infinite zeroes, it will eventually either completely fill the output buffer + * or return an error. However, we do it to be slightly more friendly to the + * not-recommended use case of decompressing with an unknown output size.) + */ +#define REFILL_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + if (likely(in_next != in_end)) { \ + bitbuf |= (bitbuf_t)*in_next++ << \ + (u8)bitsleft; \ + } else { \ + overread_count++; \ + SAFETY_CHECK(overread_count <= \ + sizeof(bitbuf_t)); \ + } \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the + * end of the input. It can only be used in the fastloop. + */ +#define REFILL_BITS_IN_FASTLOOP() \ +do { \ + STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ + FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ + if (UNALIGNED_ACCESS_IS_FAST) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * This is the worst-case maximum number of output bytes that are written to + * during each iteration of the fastloop. 
The worst case is 2 literals, then a + * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must + * be included for the intentional overrun in the match copy implementation. + */ +#define FASTLOOP_MAX_BYTES_WRITTEN \ + (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) + +/* + * This is the worst-case maximum number of input bytes that are read during + * each iteration of the fastloop. To get this value, we first compute the + * greatest number of bits that can be refilled during a loop iteration. The + * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can + * be refilled later is no more than the maximum amount that can be consumed by + * 2 literals that don't need a subtable, then a match. We convert this value + * to bytes, rounding up; this gives the maximum number of bytes that 'in_next' + * can be advanced. Finally, we add sizeof(bitbuf_t) to account for + * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. + */ +#define FASTLOOP_MAX_BYTES_READ \ + (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ + LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ + sizeof(bitbuf_t)) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * The fastest way to decode Huffman-encoded data is basically to use a decode + * table that maps the next TABLEBITS bits of data to their symbol. Each entry + * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A + * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table. + * + * Ideally, TABLEBITS and the maximum codeword length would be the same; some + * compression formats are designed with this goal in mind. Unfortunately, in + * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is + * too large for a practical TABLEBITS. 
It's not *that* much larger, though, so + * the workaround is to use a single level of subtables. In the main table, + * entries for prefixes of codewords longer than TABLEBITS contain a "pointer" + * to the appropriate subtable along with the number of bits it is indexed with. + * + * The most efficient way to allocate subtables is to allocate them dynamically + * after the main table. The worst-case number of table entries needed, + * including subtables, is precomputable; see the ENOUGH constants below. + * + * A useful optimization is to store the codeword lengths in the decode table so + * that they don't have to be looked up by indexing a separate table that maps + * symbols to their codeword lengths. We basically do this; however, for the + * litlen and offset codes we also implement some DEFLATE-specific optimizations + * that build in the consideration of the "extra bits" and the + * literal/length/end-of-block division. For the exact decode table entry + * format we use, see the definitions of the *_decode_results[] arrays below. + */ + + +/* + * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes, + * along with their corresponding ENOUGH values. + * + * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum + * precode codeword length. This avoids ever needing subtables. + * + * For the litlen and offset codes, we cannot realistically avoid ever needing + * subtables, since litlen and offset codewords can be up to 15 bits. A higher + * TABLEBITS reduces the number of lookups that need a subtable, which increases + * performance; however, it increases memory usage and makes building the table + * take longer, which decreases performance. We choose values that work well in + * practice, making subtables rarely needed without making the tables too large. + * + * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special + * considerations, 9 would fit the trade-off curve better. 
However, there is a + * performance benefit to using exactly 8 bits when it is a compile-time + * constant, as many CPUs can take the low byte more easily than the low 9 bits. + * + * zlib treats its equivalents of TABLEBITS as maximum values; whenever it + * builds a table, it caps the actual table_bits to the longest codeword. This + * makes sense in theory, as there's no need for the table to be any larger than + * needed to support the longest codeword. However, having the table bits be a + * compile-time constant is beneficial to the performance of the decode loop, so + * there is a trade-off. libdeflate currently uses the dynamic table_bits + * strategy for the litlen table only, due to its larger maximum size. + * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there + * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above. + * + * Each TABLEBITS value has a corresponding ENOUGH value that gives the + * worst-case maximum number of decode table entries, including the main table + * and all subtables. The ENOUGH value depends on three parameters: + * + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the maximum number of main table bits (*_TABLEBITS) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * + * The ENOUGH values were computed using the utility program 'enough' from zlib. + */ +#define PRECODE_TABLEBITS 7 +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_TABLEBITS 11 +#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ +#define OFFSET_TABLEBITS 8 +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ + +/* + * make_decode_table_entry() creates a decode table entry for the given symbol + * by combining the static part 'decode_results[sym]' with the dynamic part + * 'len', which is the remaining codeword length (the codeword length for main + * table entries, or the codeword length minus TABLEBITS for subtable entries). 
+ * + * In all cases, we add 'len' to each of the two low-order bytes to create the + * appropriately-formatted decode table entry. See the definitions of the + * *_decode_results[] arrays below, where the entry format is described. + */ +static forceinline u32 +make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) +{ + return decode_results[sym] + (len << 8) + len; +} + +/* + * Here is the format of our precode decode table entries. Bits not explicitly + * described contain zeroes: + * + * Bit 20-16: presym + * Bit 10-8: codeword length [not used] + * Bit 2-0: codeword length + * + * The precode decode table never has subtables, since we use + * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. + * + * precode_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 precode_decode_results[] = { +#define ENTRY(presym) ((u32)presym << 16) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , +#undef ENTRY +}; + +/* Litlen and offset decode table entry flags */ + +/* Indicates a literal entry in the litlen decode table */ +#define HUFFDEC_LITERAL 0x80000000 + +/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ +#define HUFFDEC_EXCEPTIONAL 0x00008000 + +/* Indicates a subtable pointer entry in the litlen or offset decode table */ +#define HUFFDEC_SUBTABLE_POINTER 0x00004000 + +/* Indicates an end-of-block entry in the litlen decode table */ +#define HUFFDEC_END_OF_BLOCK 0x00002000 + +/* Maximum number of bits that can be consumed by decoding a match length */ +#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) + 
+/* + * Here is the format of our litlen decode table entries. Bits not explicitly + * described contain zeroes: + * + * Literals: + * Bit 31: 1 (HUFFDEC_LITERAL) + * Bit 23-16: literal value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Lengths: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 24-16: length base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * End of block: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Subtable pointer: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 30-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * This format has several desirable properties: + * + * - The codeword length, length slot base, and number of extra length bits + * are all built in. This eliminates the need to separately look up this + * information by indexing separate arrays by symbol or length slot. + * + * - The HUFFDEC_* flags enable easily distinguishing between the different + * types of entries. The HUFFDEC_LITERAL flag enables a fast path for + * literals; the high bit is used for this, as some CPUs can test the + * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag + * makes it possible to detect the two unlikely cases (subtable pointer + * and end of block) in a single bit flag test. 
+ * + * - The low byte is the number of bits that need to be removed from the + * bitstream; this makes this value easily accessible, and it enables the + * micro-optimization of doing 'bitsleft -= entry' instead of + * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, + * so they don't need to be removed separately. + * + * - The flags in bits 15-13 are arranged to be 0 when the + * "remaining codeword length" in bits 11-8 is needed, making this value + * fairly easily accessible as well via a shift and downcast. + * + * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are + * needed, making it possible to extract this value with '& 0x3F' rather + * than '& 0xF'. This value is only used as a shift amount, so this can + * save an 'and' instruction as the masking by 0x3F happens implicitly. + * + * litlen_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 litlen_decode_results[] = { + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , 
ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , 
ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , +#undef ENTRY + + /* End of block */ + HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, + + /* Lengths */ +#define ENTRY(length_base, num_extra_bits) \ + (((u32)(length_base) << 16) | (num_extra_bits)) + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#undef ENTRY +}; + +/* Maximum number of bits that can be consumed by decoding a match offset */ +#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) + +/* + * Here is the format of our offset decode table entries. 
Bits not explicitly + * described contain zeroes: + * + * Offsets: + * Bit 31-16: offset base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * Subtable pointer: + * Bit 31-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * These work the same way as the length entries and subtable pointer entries in + * the litlen decode table; see litlen_decode_results[] above. + */ +static const u32 offset_decode_results[] = { +#define ENTRY(offset_base, num_extra_bits) \ + (((u32)(offset_base) << 16) | (num_extra_bits)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , +#undef ENTRY +}; + +/* + * The main DEFLATE decompressor structure. Since libdeflate only supports + * full-buffer decompression, this structure doesn't store the entire + * decompression state, most of which is in stack variables. Instead, this + * struct just contains the decode tables and some temporary arrays used for + * building them, as these are too large to comfortably allocate on the stack. 
+ * + * Storing the decode tables in the decompressor struct also allows the decode + * tables for the static codes to be reused whenever two static Huffman blocks + * are decoded without an intervening dynamic block, even across streams. + */ +struct libdeflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + u8 lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; + unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; +}; + +/* + * Build a table for fast decoding of symbols from a Huffman code. As input, + * this function takes the codeword length of each symbol which may be used in + * the code. As output, it produces a decode table for the canonical Huffman + * code described by the codeword lengths. The decode table is built with the + * assumption that it will be indexed with "bit-reversed" codewords, where the + * low-order bit is the first bit of the codeword. This format is used for all + * Huffman codes in DEFLATE. + * + * @decode_table + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. 
+ * @lens + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. + * @num_syms + * The number of symbols in the code, including all unused symbols. + * @decode_results + * An array which gives the incomplete decode result for each symbol. The + * needed values in this array will be combined with codeword lengths to + * make the final decode table entries using make_decode_table_entry(). + * @table_bits + * The log base-2 of the number of main table entries to use. + * If @table_bits_ret != NULL, then @table_bits is treated as a maximum + * value and it will be decreased if a smaller table would be sufficient. + * @max_codeword_len + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * @sorted_syms + * A temporary array of length @num_syms. + * @table_bits_ret + * If non-NULL, then the dynamic table_bits is enabled, and the actual + * table_bits value will be returned here. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. 
+ */ +static bool +build_decode_table(u32 decode_table[], + const u8 lens[], + const unsigned num_syms, + const u32 decode_results[], + unsigned table_bits, + unsigned max_codeword_len, + u16 *sorted_syms, + unsigned *table_bits_ret) +{ + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Determine the actual maximum codeword length that was used, and + * decrease table_bits to it if allowed. + */ + while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) + max_codeword_len--; + if (table_bits_ret != NULL) { + table_bits = MIN(table_bits, max_codeword_len); + *table_bits_ret = table_bits; + } + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. 
*/ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. */ + + /* + * Check whether the lengths form a complete code (exactly fills the + * codespace), an incomplete code (doesn't fill the codespace), or an + * overfull code (overflows the codespace). A codeword of length 'n' + * uses proportion '1/(2^n)' of the codespace. An overfull code is + * nonsensical, so is considered invalid. An incomplete code is + * considered valid only in two specific cases; see below. + */ + + /* overfull code? */ + if (unlikely(codespace_used > (1U << max_codeword_len))) + return false; + + /* incomplete code? */ + if (unlikely(codespace_used < (1U << max_codeword_len))) { + u32 entry; + unsigned i; + + if (codespace_used == 0) { + /* + * An empty code is allowed. This can happen for the + * offset code in DEFLATE, since a dynamic Huffman block + * need not contain any matches. + */ + + /* sym=0, len=1 (arbitrary) */ + entry = make_decode_table_entry(decode_results, 0, 1); + } else { + /* + * Allow codes with a single used symbol, with codeword + * length 1. The DEFLATE RFC is unclear regarding this + * case. What zlib's decompressor does is permit this + * for the litlen and offset codes and assume the + * codeword is '0' rather than '1'. We do the same + * except we allow this for precodes too, since there's + * no convincing reason to treat the codes differently. 
+ * We also assign both codewords '0' and '1' to the + * symbol to avoid having to handle '1' specially. + */ + if (codespace_used != (1U << (max_codeword_len - 1)) || + len_counts[1] != 1) + return false; + entry = make_decode_table_entry(decode_results, + *sorted_syms, 1); + } + /* + * Note: the decode table still must be fully initialized, in + * case the stream is malformed and contains bits from the part + * of the codespace the incomplete code doesn't use. + */ + for (i = 0; i < (1U << table_bits); i++) + decode_table[i] = entry; + return true; + } + + /* + * The lengths form a complete code. Now, enumerate the codewords in + * lexicographic order and fill the decode table entries for each one. + * + * First, process all codewords with len <= table_bits. Each one gets + * '2^(table_bits-len)' direct entries in the table. + * + * Since DEFLATE uses bit-reversed codewords, these entries aren't + * consecutive but rather are spaced '2^len' entries apart. This makes + * filling them naively somewhat awkward and inefficient, since strided + * stores are less cache-friendly and preclude the use of word or + * vector-at-a-time stores to fill multiple entries per instruction. + * + * To optimize this, we incrementally double the table size. When + * processing codewords with length 'len', the table is treated as + * having only '2^len' entries, so each codeword uses just one entry. + * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). 
+ */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. */ + decode_table[codeword] = + make_decode_table_entry(decode_results, + *sorted_syms++, len); + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. */ + do { + if (++len <= table_bits) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. 
*/ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. + */ + decode_table[subtable_prefix] = + ((u32)subtable_start << 16) | + HUFFDEC_EXCEPTIONAL | + HUFFDEC_SUBTABLE_POINTER | + (subtable_bits << 8) | table_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = make_decode_table_entry(decode_results, *sorted_syms++, + len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? 
*/ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct libdeflate_decompressor *d) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == + DEFLATE_NUM_PRECODE_SYMS); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); + + STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == + DEFLATE_NUM_LITLEN_SYMS); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms, + &d->litlen_tablebits); +} + +/* Build the decode table for the offset code. */ +static bool +build_offset_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! 
*/ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == + DEFLATE_NUM_OFFSET_SYMS); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +typedef enum libdeflate_result (*decompress_func_t) + (struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +#define FUNCNAME deflate_decompress_default +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets. + */ + +#ifndef ATTRIBUTES +# define ATTRIBUTES +#endif +#ifndef EXTRACT_VARBITS +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +#endif +#ifndef EXTRACT_VARBITS8 +# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +#endif + +static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + +next_block: + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block 
*/ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. 
*/ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. 
*/ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). 
+ */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. 
It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. + */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. + */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? 
*/ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. 
+ */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. 
+ */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. 
*/ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. 
+ */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. 
+ */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. 
+ */ +generic_loop: + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + +block_done: + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. 
+ */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 + + +/* Include architecture-specific implementation(s) if available. */ +#undef DEFAULT_IMPL +#undef arch_select_decompress_func +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) +#ifndef LIB_X86_DECOMPRESS_IMPL_H +#define LIB_X86_DECOMPRESS_IMPL_H + +/* + * BMI2 optimized version + * + * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation + * enabled yet. That would require that this be moved to its own .c file. + */ +#if HAVE_BMI2_INTRIN +# define deflate_decompress_bmi2 deflate_decompress_bmi2 +# define FUNCNAME deflate_decompress_bmi2 +# if !HAVE_BMI2_NATIVE +# define ATTRIBUTES _target_attribute("bmi2") +# endif + /* + * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the + * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic + * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)'; + * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'. + * Nevertheless, their implementation using the bzhi intrinsic is identical, + * as the bzhi instruction truncates the count to 8 bits implicitly. 
+ */ +# ifndef __clang__ +# include <immintrin.h> /* declares _bzhi_u32() and _bzhi_u64() */ +# ifdef ARCH_X86_64 +# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) +# else +# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count)) +# endif +# endif +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets.
+ */ + +#ifndef ATTRIBUTES +# define ATTRIBUTES +#endif +#ifndef EXTRACT_VARBITS +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +#endif +#ifndef EXTRACT_VARBITS8 +# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +#endif + +static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + +next_block: + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. 
*/ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. */ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. 
*/ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. 
*/ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). 
+ */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. 
It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. + */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. + */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? 
*/ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. 
+ */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. 
+ */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. 
*/ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. 
+ */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. 
+ */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. 
+ */ +generic_loop: + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + +block_done: + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. 
+ */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 + +#endif /* HAVE_BMI2_INTRIN */ + +#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE +#define DEFAULT_IMPL deflate_decompress_bmi2 +#else +static inline decompress_func_t +arch_select_decompress_func(void) +{ +#ifdef deflate_decompress_bmi2 + if (HAVE_BMI2(get_x86_cpu_features())) + return deflate_decompress_bmi2; +#endif + return NULL; +} +#define arch_select_decompress_func arch_select_decompress_func +#endif + +#endif /* LIB_X86_DECOMPRESS_IMPL_H */ + +#endif + +#ifndef DEFAULT_IMPL +# define DEFAULT_IMPL deflate_decompress_default +#endif + +#ifdef arch_select_decompress_func +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +static volatile decompress_func_t decompress_impl = dispatch_decomp; + +/* Choose the best implementation at runtime. 
*/ +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return f(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} +#else +/* The best implementation is statically known, so call it directly. */ +# define decompress_impl DEFAULT_IMPL +#endif + +/* + * This is the main DEFLATE decompression routine. See libdeflate.h for the + * documentation. + * + * Note that the real code is in decompress_template.h. The part here just + * handles calling the appropriate implementation depending on the CPU features + * at runtime. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} + +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} + +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) +{ + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ? 
options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * - 'free_func' must be set. + * + * But for simplicity, we currently just zero the whole decompressor. + */ + memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; + return d; +} + +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + +LIBDEFLATEAPI void +libdeflate_free_decompressor(struct libdeflate_decompressor *d) +{ + if (d) + d->free_func(d); +} + + +/* + * utils.c - utility functions for libdeflate + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef FREESTANDING +# define malloc NULL +# define free NULL +#else +# include +#endif + +malloc_func_t libdeflate_default_malloc_func = malloc; +free_func_t libdeflate_default_free_func = free; + +void * +libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size) +{ + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); + + if (ptr) { + void *orig_ptr = ptr; + + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; +} + +void +libdeflate_aligned_free(free_func_t free_func, void *ptr) +{ + (*free_func)(((void **)ptr)[-1]); +} + +LIBDEFLATEAPI void +libdeflate_set_memory_allocator(malloc_func_t malloc_func, + free_func_t free_func) +{ + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; +} + +/* + * Implementations of libc functions for freestanding library builds. + * Normal library builds don't use these. Not optimized yet; usually the + * compiler expands these functions and doesn't actually call them anyway. 
+ */ +#ifdef FREESTANDING +#undef memset +void * __attribute__((weak)) +memset(void *s, int c, size_t n) +{ + u8 *p = s; + size_t i; + + for (i = 0; i < n; i++) + p[i] = c; + return s; +} + +#undef memcpy +void * __attribute__((weak)) +memcpy(void *dest, const void *src, size_t n) +{ + u8 *d = dest; + const u8 *s = src; + size_t i; + + for (i = 0; i < n; i++) + d[i] = s[i]; + return dest; +} + +#undef memmove +void * __attribute__((weak)) +memmove(void *dest, const void *src, size_t n) +{ + u8 *d = dest; + const u8 *s = src; + size_t i; + + if (d <= s) + return memcpy(d, s, n); + + for (i = n; i > 0; i--) + d[i - 1] = s[i - 1]; + return dest; +} + +#undef memcmp +int __attribute__((weak)) +memcmp(const void *s1, const void *s2, size_t n) +{ + const u8 *p1 = s1; + const u8 *p2 = s2; + size_t i; + + for (i = 0; i < n; i++) { + if (p1[i] != p2[i]) + return (int)p1[i] - (int)p2[i]; + } + return 0; +} +#endif /* FREESTANDING */ + +#ifdef LIBDEFLATE_ENABLE_ASSERTIONS +#include +#include +void +libdeflate_assertion_failed(const char *expr, const char *file, int line) +{ + fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); + abort(); +} +#endif /* LIBDEFLATE_ENABLE_ASSERTIONS */ + +/* + * x86/cpu_features.c - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#if HAVE_DYNAMIC_X86_CPU_FEATURES + +/* + * With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 + */ +#if defined(ARCH_X86_32) && defined(__PIC__) +# define EBX_CONSTRAINT "=&r" +#else +# define EBX_CONSTRAINT "=b" +#endif + +/* Execute the CPUID instruction. */ +static inline void +cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) +{ +#ifdef _MSC_VER + int result[4]; + + __cpuidex(result, leaf, subleaf); + *a = result[0]; + *b = result[1]; + *c = result[2]; + *d = result[3]; +#else + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +#endif +} + +/* Read an extended control register. */ +static inline u64 +read_xcr(u32 index) +{ +#ifdef _MSC_VER + return _xgetbv(index); +#else + u32 d, a; + + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. 
+ */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (index)); + + return ((u64)d << 32) | a; +#endif +} + +static const struct cpu_feature x86_cpu_feature_table[] = { + {X86_CPU_FEATURE_SSE2, "sse2"}, + {X86_CPU_FEATURE_PCLMUL, "pclmul"}, + {X86_CPU_FEATURE_AVX, "avx"}, + {X86_CPU_FEATURE_AVX2, "avx2"}, + {X86_CPU_FEATURE_BMI2, "bmi2"}, +}; + +volatile u32 libdeflate_x86_cpu_features = 0; + +/* Initialize libdeflate_x86_cpu_features. */ +void libdeflate_init_x86_cpu_features(void) +{ + u32 max_leaf, a, b, c, d; + u64 xcr0 = 0; + u32 features = 0; + + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &b, &c, &d); + if (max_leaf < 1) + goto out; + + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + if (d & (1 << 26)) + features |= X86_CPU_FEATURE_SSE2; + if (c & (1 << 1)) + features |= X86_CPU_FEATURE_PCLMUL; + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX; + + if (max_leaf < 7) + goto out; + + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX2; + if (b & (1 << 8)) + features |= X86_CPU_FEATURE_BMI2; + +out: + disable_cpu_features_for_testing(&features, x86_cpu_feature_table, + ARRAY_LEN(x86_cpu_feature_table)); + + libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; +} + +#endif /* HAVE_DYNAMIC_X86_CPU_FEATURES */ diff --git a/Plugins/nosGeometry/External/openFBX/libdeflate.h b/Plugins/nosGeometry/External/openFBX/libdeflate.h new file mode 100644 index 00000000..382d895d --- /dev/null +++ b/Plugins/nosGeometry/External/openFBX/libdeflate.h @@ -0,0 +1,411 @@ +/* + * libdeflate.h - public header for libdeflate + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBDEFLATE_VERSION_MAJOR 1 +#define 
LIBDEFLATE_VERSION_MINOR 18 +#define LIBDEFLATE_VERSION_STRING "1.18" + +/* + * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause + * __declspec(dllimport) to be used. This should be done when it's easy to do. + * Otherwise it's fine to skip it, since it is a very minor performance + * optimization that is irrelevant for most use cases of libdeflate. + */ +#ifndef LIBDEFLATEAPI +# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATEAPI __declspec(dllimport) +# else +# define LIBDEFLATEAPI +# endif +#endif + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct libdeflate_compressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_compressor() allocates a new compressor that supports + * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression + * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = + * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means + * "no compression", specifically "create a valid stream, but only emit + * uncompressed blocks" (this will expand the data slightly). + * + * The return value is a pointer to the new compressor, or NULL if out of memory + * or if the compression level is invalid (i.e. outside the range [0, 12]). + * + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. + * + * A single compressor is not safe to use by multiple threads concurrently. + * However, different threads may use different compressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level); + +/* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. 
+ */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options); + +/* + * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of + * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and + * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The + * return value is the compressed size in bytes, or 0 if the data could not be + * compressed to 'out_nbytes_avail' bytes or fewer (but see note below). + * + * If compression is successful, then the output data is guaranteed to be a + * valid DEFLATE stream that decompresses to the input data. No other + * guarantees are made about the output data. Notably, different versions of + * libdeflate can produce different compressed data for the same uncompressed + * data, even at the same compression level. Do ***NOT*** do things like + * writing tests that compare compressed data to a golden output, as this can + * break when libdeflate is updated. (This property isn't specific to + * libdeflate; the same is true for zlib and other compression libraries too.) + */ +LIBDEFLATEAPI size_t +libdeflate_deflate_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the + * number of bytes of compressed data that may be produced by compressing any + * buffer of length less than or equal to 'in_nbytes' using + * libdeflate_deflate_compress() with the specified compressor. This bound will + * necessarily be a number greater than or equal to 'in_nbytes'. It may be an + * overestimate of the true upper bound. The return value is guaranteed to be + * the same for all invocations with the same compressor and same 'in_nbytes'. + * + * As a special case, 'compressor' may be NULL. 
This causes the bound to be + * taken across *any* libdeflate_compressor that could ever be allocated with + * this build of the library, with any options. + * + * Note that this function is not necessary in many applications. With + * block-based compression, it is usually preferable to separately store the + * uncompressed size of each block and to store any blocks that did not compress + * to less than their original size uncompressed. In that scenario, there is no + * need to know the worst-case compressed size, since the maximum number of + * bytes of compressed data that may be used would always be one less than the + * input length. You can just pass a buffer of that size to + * libdeflate_deflate_compress() and store the data uncompressed if + * libdeflate_deflate_compress() returns 0, indicating that the compressed data + * did not fit into the provided output buffer. + */ +LIBDEFLATEAPI size_t +libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * Like libdeflate_deflate_compress(), but uses the zlib wrapper format instead + * of raw DEFLATE. + */ +LIBDEFLATEAPI size_t +libdeflate_zlib_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like libdeflate_deflate_compress_bound(), but assumes the data will be + * compressed with libdeflate_zlib_compress() rather than with + * libdeflate_deflate_compress(). + */ +LIBDEFLATEAPI size_t +libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * Like libdeflate_deflate_compress(), but uses the gzip wrapper format instead + * of raw DEFLATE. 
+ */ +LIBDEFLATEAPI size_t +libdeflate_gzip_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like libdeflate_deflate_compress_bound(), but assumes the data will be + * compressed with libdeflate_gzip_compress() rather than with + * libdeflate_deflate_compress(). + */ +LIBDEFLATEAPI size_t +libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * libdeflate_free_compressor() frees a compressor that was allocated with + * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. + */ +LIBDEFLATEAPI void +libdeflate_free_compressor(struct libdeflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct libdeflate_decompressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_decompressor() allocates a new decompressor that can be used + * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to + * the new decompressor, or NULL if out of memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + * + * A single decompressor is not safe to use by multiple threads concurrently. + * However, different threads may use different decompressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void); + +/* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + +/* + * Result of a call to libdeflate_deflate_decompress(), + * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). 
+ */ +enum libdeflate_result { + /* Decompression was successful. */ + LIBDEFLATE_SUCCESS = 0, + + /* Decompression failed because the compressed data was invalid, + * corrupt, or otherwise unsupported. */ + LIBDEFLATE_BAD_DATA = 1, + + /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have + * decompressed to fewer than 'out_nbytes_avail' bytes. */ + LIBDEFLATE_SHORT_OUTPUT = 2, + + /* The data would have decompressed to more than 'out_nbytes_avail' + * bytes. */ + LIBDEFLATE_INSUFFICIENT_SPACE = 3, +}; + +/* + * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer + * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If + * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise, + * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the + * contents of the output buffer are undefined. + * + * Decompression stops at the end of the DEFLATE stream (as indicated by the + * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. + * + * libdeflate_deflate_decompress() can be used in cases where the actual + * uncompressed size is known (recommended) or unknown (not recommended): + * + * - If the actual uncompressed size is known, then pass the actual + * uncompressed size as 'out_nbytes_avail' and pass NULL for + * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail + * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the + * specified number of bytes. + * + * - If the actual uncompressed size is unknown, then provide a non-NULL + * 'actual_out_nbytes_ret' and provide a buffer with some size + * 'out_nbytes_avail' that you think is large enough to hold all the + * uncompressed data. 
In this case, if the data decompresses to less than + * or equal to 'out_nbytes_avail' bytes, then + * libdeflate_deflate_decompress() will write the actual uncompressed size + * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, + * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was + * not large enough but no other problems were encountered, or another + * nonzero result code if decompression failed for another reason. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, + * then the actual compressed size of the DEFLATE stream (aligned to the next + * byte boundary) is written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format + * instead of raw DEFLATE. + * + * Decompression will stop at the end of the zlib stream, even if it is shorter + * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, + * use libdeflate_zlib_decompress_ex(). + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. 
If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first zlib-compressed stream in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format + * instead of raw DEFLATE. + * + * If multiple gzip-compressed members are concatenated, then only the first + * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need + * multi-member support. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first gzip-compressed member in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * libdeflate_free_decompressor() frees a decompressor that was allocated with + * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. 
+ */ +LIBDEFLATEAPI void +libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); + +/* ========================================================================== */ +/* Checksums */ +/* ========================================================================== */ + +/* + * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of + * data and returns the updated checksum. When starting a new checksum, the + * required initial value for 'adler' is 1. This value is also returned when + * 'buffer' is specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); + + +/* + * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data + * and returns the updated checksum. When starting a new checksum, the required + * initial value for 'crc' is 0. This value is also returned when 'buffer' is + * specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); + +/* ========================================================================== */ +/* Custom memory allocator */ +/* ========================================================================== */ + +/* + * Install a custom memory allocator which libdeflate will use for all memory + * allocations by default. 'malloc_func' is a function that must behave like + * malloc(), and 'free_func' is a function that must behave like free(). + * + * The per-(de)compressor custom memory allocator that can be specified in + * 'struct libdeflate_options' takes priority over this. + * + * This doesn't affect the free() function that will be used to free + * (de)compressors that were already in existence when this is called. + */ +LIBDEFLATEAPI void +libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), + void (*free_func)(void *)); + +/* + * Advanced options. 
This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. + */ +struct libdeflate_options { + + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; + + /* + * An optional custom memory allocator to use for this (de)compressor. + * 'malloc_func' must be a function that behaves like malloc(), and + * 'free_func' must be a function that behaves like free(). + * + * This is useful in cases where a process might have multiple users of + * libdeflate who want to use different memory allocators. For example, + * a library might want to use libdeflate with a custom memory allocator + * without interfering with user code that might use libdeflate too. + * + * This takes priority over the "global" memory allocator (which by + * default is malloc() and free(), but can be changed by + * libdeflate_set_memory_allocator()). Moreover, libdeflate will never + * call the "global" memory allocator if a per-(de)compressor custom + * allocator is always given. 
+ */ + void *(*malloc_func)(size_t); + void (*free_func)(void *); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/Plugins/nosGeometry/External/openFBX/ofbx.cpp b/Plugins/nosGeometry/External/openFBX/ofbx.cpp new file mode 100644 index 00000000..f14444e6 --- /dev/null +++ b/Plugins/nosGeometry/External/openFBX/ofbx.cpp @@ -0,0 +1,4102 @@ +#include "ofbx.h" +#include "libdeflate.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __cplusplus >= 202002L +#include // for std::bit_cast (C++20 and later) +#endif +#include + +namespace ofbx +{ + +static int decodeIndex(int idx) +{ + return (idx < 0) ? (-idx - 1) : idx; +} + +static int codeIndex(int idx, bool last) +{ + return last ? (-idx - 1) : idx; +} + +template +static T& emplace_back(std::vector& vec) { + vec.emplace_back(); + return vec.back(); +} + +struct Allocator { + struct Page { + struct { + Page* next = nullptr; + u32 offset = 0; + } header; + u8 data[4096 * 1024 - 12]; + }; + Page* first = nullptr; + + ~Allocator() { + Page* p = first; + while (p) { + Page* n = p->header.next; + delete p; + p = n; + } + } + + template T* allocate(Args&&... args) + { + assert(sizeof(T) <= sizeof(first->data)); + if (!first) { + first = new Page; + } + Page* p = first; + if (p->header.offset % alignof(T) != 0) { + p->header.offset += alignof(T) - p->header.offset % alignof(T); + } + + if (p->header.offset + sizeof(T) > sizeof(p->data)) { + p = new Page; + p->header.next = first; + first = p; + } + T* res = new (p->data + p->header.offset) T(args...); + p->header.offset += sizeof(T); + return res; + } +}; + + +struct Video +{ + IElementProperty* base64_property = nullptr; + DataView filename; + DataView content; + DataView media; + bool is_base_64; +}; + + +struct Error +{ + Error() {} + Error(const char* msg) + { + s_message = msg; + } + + // Format a message with printf-style arguments. 
+ template + Error(const char* fmt, Args... args) + { + char buf[1024]; + std::snprintf(buf, sizeof(buf), fmt, args...); + s_message = buf; + } + + static const char* s_message; +}; + + +const char* Error::s_message = ""; + + +template struct OptionalError +{ + OptionalError(Error error) + : is_error(true) + { + } + + + OptionalError(T _value) + : value(_value) + , is_error(false) + { + } + + + T getValue() const + { +#ifdef _DEBUG + assert(error_checked); +#endif + return value; + } + + + bool isError() + { +#ifdef _DEBUG + error_checked = true; +#endif + return is_error; + } + + +private: + T value; + bool is_error; +#ifdef _DEBUG + bool error_checked = false; +#endif +}; + + +#pragma pack(1) +struct Header +{ + u8 magic[21]; + u8 reserved[2]; + u32 version; +}; +#pragma pack() + + +struct Cursor +{ + const u8* current; + const u8* begin; + const u8* end; +}; + + +static void setTranslation(const DVec3& t, DMatrix* mtx) +{ + mtx->m[12] = t.x; + mtx->m[13] = t.y; + mtx->m[14] = t.z; +} + + +static DVec3 operator-(const DVec3& v) +{ + return {-v.x, -v.y, -v.z}; +} + + +static DMatrix operator*(const DMatrix& lhs, const DMatrix& rhs) +{ + DMatrix res; + for (int j = 0; j < 4; ++j) + { + for (int i = 0; i < 4; ++i) + { + double tmp = 0; + for (int k = 0; k < 4; ++k) + { + tmp += lhs.m[i + k * 4] * rhs.m[k + j * 4]; + } + res.m[i + j * 4] = tmp; + } + } + return res; +} + + +static DMatrix makeIdentity() +{ + return {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}; +} + + +static DMatrix rotationX(double angle) +{ + DMatrix m = makeIdentity(); + double c = cos(angle); + double s = sin(angle); + + m.m[5] = m.m[10] = c; + m.m[9] = -s; + m.m[6] = s; + + return m; +} + + +static DMatrix rotationY(double angle) +{ + DMatrix m = makeIdentity(); + double c = cos(angle); + double s = sin(angle); + + m.m[0] = m.m[10] = c; + m.m[8] = s; + m.m[2] = -s; + + return m; +} + + +static DMatrix rotationZ(double angle) +{ + DMatrix m = makeIdentity(); + double c = cos(angle); + double 
s = sin(angle); + + m.m[0] = m.m[5] = c; + m.m[4] = -s; + m.m[1] = s; + + return m; +} + + +static DMatrix getRotationMatrix(const DVec3& euler, RotationOrder order) +{ + const double TO_RAD = 3.1415926535897932384626433832795028 / 180.0; + DMatrix rx = rotationX(euler.x * TO_RAD); + DMatrix ry = rotationY(euler.y * TO_RAD); + DMatrix rz = rotationZ(euler.z * TO_RAD); + switch (order) + { + default: + case RotationOrder::EULER_XYZ: return rz * ry * rx; + case RotationOrder::EULER_XZY: return ry * rz * rx; + case RotationOrder::EULER_YXZ: return rz * rx * ry; + case RotationOrder::EULER_YZX: return rx * rz * ry; + case RotationOrder::EULER_ZXY: return ry * rx * rz; + case RotationOrder::EULER_ZYX: return rx * ry * rz; + case RotationOrder::SPHERIC_XYZ: assert(false); Error::s_message = "Unsupported rotation order."; return rx * ry * rz; + } +} + + +double fbxTimeToSeconds(i64 value) +{ + return double(value) / 46186158000L; +} + + +i64 secondsToFbxTime(double value) +{ + return i64(value * 46186158000L); +} + + +static DVec3 operator*(const DVec3& v, float f) +{ + return {v.x * f, v.y * f, v.z * f}; +} + + +static DVec3 operator+(const DVec3& a, const DVec3& b) +{ + return {a.x + b.x, a.y + b.y, a.z + b.z}; +} + +static FVec3 operator+(const FVec3& a, const FVec3& b) +{ + return {a.x + b.x, a.y + b.y, a.z + b.z}; +} + + +template static bool copyString(char (&destination)[SIZE], const char* source) +{ + const char* src = source; + char* dest = destination; + int length = SIZE; + if (!src) return false; + + while (*src && length > 1) + { + *dest = *src; + --length; + ++dest; + ++src; + } + *dest = 0; + return *src == '\0'; +} + + +u64 DataView::toU64() const +{ + if (is_binary) + { + assert(end - begin == sizeof(u64)); + u64 result; + memcpy(&result, begin, sizeof(u64)); + return result; + } + static_assert(sizeof(unsigned long long) >= sizeof(u64), "can't use strtoull"); + return strtoull((const char*)begin, nullptr, 10); +} + + +i64 DataView::toI64() const +{ + if 
(is_binary) + { + assert(end - begin == sizeof(i64)); + i64 result; + memcpy(&result, begin, sizeof(i64)); + return result; + } + static_assert(sizeof(long long) >= sizeof(i64), "can't use atoll"); + return atoll((const char*)begin); +} + + +int DataView::toInt() const +{ + if (is_binary) + { + assert(end - begin == sizeof(int)); + int result; + memcpy(&result, begin, sizeof(int)); + return result; + } + return atoi((const char*)begin); +} + + +u32 DataView::toU32() const +{ + if (is_binary) + { + assert(end - begin == sizeof(u32)); + u32 result; + memcpy(&result, begin, sizeof(u32)); + return result; + } + return (u32)atoll((const char*)begin); +} + +bool DataView::toBool() const +{ + return toInt() != 0; +} + + +double DataView::toDouble() const +{ + if (is_binary) + { + assert(end - begin == sizeof(double)); + double result; + memcpy(&result, begin, sizeof(double)); + return result; + } + return atof((const char*)begin); +} + + +float DataView::toFloat() const +{ + if (is_binary) + { + assert(end - begin == sizeof(float)); + float result; + memcpy(&result, begin, sizeof(float)); + return result; + } + return (float)atof((const char*)begin); +} + + +bool DataView::operator==(const char* rhs) const +{ + if (!begin) return !rhs[0]; + const char* c = rhs; + const char* c2 = (const char*)begin; + while (*c && c2 != (const char*)end) + { + if (*c != *c2) return false; + ++c; + ++c2; + } + return *c2 == '\0' || c2 == (const char*)end && *c == '\0'; +} + + +struct Property; +struct Element; + +template static bool parseMemory(const Property& property, T* out, int max_size_bytes); +template static bool parseVecData(Property& property, std::vector* out_vec); +template static bool parseVertexData(const Element& element, const char* name, const char* index_name, T& out, std::vector& jobs); +static bool parseDouble(Property& property, double* out); + +struct ParseDataJob { + using F = bool (*)(Property*, void*); + Property* property = nullptr; + void* data = nullptr; + bool 
error = false; + F f; +}; + +template [[nodiscard]] bool pushJob(std::vector& jobs, Property& prop, std::vector& data) { + ParseDataJob& job = emplace_back(jobs); + job.property = ∝ + job.data = (void*)&data; + job.f = [](Property* prop, void* data){ return parseVecData(*prop, (std::vector*)data); }; + return true; +} + +struct Property : IElementProperty +{ + Type getType() const override { return (Type)type; } + IElementProperty* getNext() const override { return next; } + DataView getValue() const override { return value; } + int getCount() const override + { + assert(type == ARRAY_DOUBLE || type == ARRAY_INT || type == ARRAY_FLOAT || type == ARRAY_LONG); + if (value.is_binary) + { + int i; + memcpy(&i, value.begin, sizeof(i)); + return i; + } + return count; + } + + bool getValues(double* values, int max_size) const override { return parseMemory(*this, values, max_size); } + + bool getValues(float* values, int max_size) const override { return parseMemory(*this, values, max_size); } + + bool getValues(u64* values, int max_size) const override { return parseMemory(*this, values, max_size); } + + bool getValues(i64* values, int max_size) const override { return parseMemory(*this, values, max_size); } + + bool getValues(int* values, int max_size) const override { return parseMemory(*this, values, max_size); } + + int count = 0; + u8 type = INTEGER; + DataView value; + Property* next = nullptr; +}; + +struct Element : IElement +{ + IElement* getFirstChild() const override { return child; } + IElement* getSibling() const override { return sibling; } + DataView getID() const override { return id; } + IElementProperty* getFirstProperty() const override { return first_property; } + IElementProperty* getProperty(int idx) const + { + IElementProperty* prop = first_property; + for (int i = 0; i < idx; ++i) + { + if (prop == nullptr) return nullptr; + prop = prop->getNext(); + } + return prop; + } + + DataView id; + Element* child = nullptr; + Element* sibling = nullptr; + 
Property* first_property = nullptr; +}; + + +static const Element* findChild(const Element& element, const char* id) +{ + Element* const* iter = &element.child; + while (*iter) + { + if ((*iter)->id == id) return *iter; + iter = &(*iter)->sibling; + } + return nullptr; +} + + +static IElement* resolveProperty(const Object& obj, const char* name, bool* is_p60) +{ + *is_p60 = false; + const Element* props = findChild((const Element&)obj.element, "Properties70"); + if (!props) { + props = findChild((const Element&)obj.element, "Properties60"); + *is_p60 = true; + if (!props) return nullptr; + } + + Element* prop = props->child; + while (prop) + { + if (prop->first_property && prop->first_property->value == name) + { + return prop; + } + prop = prop->sibling; + } + return nullptr; +} + + +static int resolveEnumProperty(const Object& object, const char* name, int default_value) +{ + bool is_p60; + Element* element = (Element*)resolveProperty(object, name, &is_p60); + if (!element) return default_value; + Property* x = (Property*)element->getProperty(is_p60 ? 3 : 4); + if (!x) return default_value; + + return x->value.toInt(); +} + + +static DVec3 resolveVec3Property(const Object& object, const char* name, const DVec3& default_value) +{ + bool is_p60; + Element* element = (Element*)resolveProperty(object, name, &is_p60); + if (!element) return default_value; + Property* x = (Property*)element->getProperty(is_p60 ? 
3 : 4); + if (!x || !x->next || !x->next->next) return default_value; + + return {x->value.toDouble(), x->next->value.toDouble(), x->next->next->value.toDouble()}; +} + +static bool isString(const Property* prop) +{ + if (!prop) return false; + return prop->getType() == Property::STRING; +} + + +static bool isLong(const Property* prop) +{ + if (!prop) return false; + return prop->getType() == Property::LONG; +} + +static bool decompress(const u8* in, size_t in_size, u8* out, size_t out_size) +{ + auto dec = libdeflate_alloc_decompressor(); + size_t dummy; + bool res = libdeflate_deflate_decompress(dec, in + 2, in_size - 2, out, out_size, &dummy) == LIBDEFLATE_SUCCESS; + libdeflate_free_decompressor(dec); + return res; +} + + +template static OptionalError read(Cursor* cursor) +{ + if (cursor->current + sizeof(T) > cursor->end) return Error("Reading past the end"); + T value = *(const T*)cursor->current; + cursor->current += sizeof(T); + return value; +} + + +static OptionalError readShortString(Cursor* cursor) +{ + DataView value; + OptionalError length = read(cursor); + if (length.isError()) return Error(); + + if (cursor->current + length.getValue() > cursor->end) return Error("Reading past the end"); + value.begin = cursor->current; + cursor->current += length.getValue(); + + value.end = cursor->current; + + return value; +} + + +static OptionalError readLongString(Cursor* cursor) +{ + DataView value; + OptionalError length = read(cursor); + if (length.isError()) return Error(); + + if (cursor->current + length.getValue() > cursor->end) return Error("Reading past the end"); + value.begin = cursor->current; + cursor->current += length.getValue(); + + value.end = cursor->current; + + return value; +} + +// Cheat sheet: // +/* +'S': Long string +'Y': 16-bit signed integer +'C': 8-bit signed integer +'I': 32-bit signed integer +'F': Single precision floating-point number +'D': Double precision floating-point number +'L': 64-bit signed integer +'R': Binary data +'b', 
'f', 'd', 'l', 'c' and 'i': Arrays of binary data + +Src: https://code.blender.org/2013/08/fbx-binary-file-format-specification/ +*/ + +static OptionalError readProperty(Cursor* cursor, Allocator& allocator) +{ + if (cursor->current == cursor->end) return Error("Reading past the end"); + + Property* prop = allocator.allocate(); + prop->next = nullptr; + prop->type = *cursor->current; + ++cursor->current; + prop->value.begin = cursor->current; + + switch (prop->type) + { + case 'S': + { + OptionalError val = readLongString(cursor); + if (val.isError()) return Error(); + prop->value = val.getValue(); + break; + } + case 'Y': cursor->current += 2; break; + case 'C': cursor->current += 1; break; + case 'I': cursor->current += 4; break; + case 'F': cursor->current += 4; break; + case 'D': cursor->current += 8; break; + case 'L': cursor->current += 8; break; + case 'R': + { + OptionalError len = read(cursor); + if (len.isError()) return Error(); + if (cursor->current + len.getValue() > cursor->end) return Error("Reading past the end"); + cursor->current += len.getValue(); + break; + } + case 'b': + case 'c': + case 'f': + case 'd': + case 'l': + case 'i': + { + OptionalError length = read(cursor); + OptionalError encoding = read(cursor); + OptionalError comp_len = read(cursor); + if (length.isError() || encoding.isError() || comp_len.isError()) return Error(); + if (cursor->current + comp_len.getValue() > cursor->end) return Error("Reading past the end"); + cursor->current += comp_len.getValue(); + break; + } + default: + { + char str[32]; + snprintf(str, sizeof(str), "Unknown property type: %c", prop->type); + return Error(str); + } + } + prop->value.end = cursor->current; + return prop; +} + +static OptionalError readElementOffset(Cursor* cursor, u32 version) +{ + if (version >= 7500) + { + OptionalError tmp = read(cursor); + if (tmp.isError()) return Error(); + return tmp.getValue(); + } + + OptionalError tmp = read(cursor); + if (tmp.isError()) return Error(); + 
return tmp.getValue(); +} + + +static OptionalError readElement(Cursor* cursor, u32 version, Allocator& allocator) +{ + OptionalError end_offset = readElementOffset(cursor, version); + if (end_offset.isError()) return Error(); + if (end_offset.getValue() == 0) return nullptr; + + OptionalError prop_count = readElementOffset(cursor, version); + OptionalError prop_length = readElementOffset(cursor, version); + if (prop_count.isError() || prop_length.isError()) return Error(); + + OptionalError id = readShortString(cursor); + if (id.isError()) return Error(); + + Element* element = allocator.allocate(); + element->first_property = nullptr; + element->id = id.getValue(); + + element->child = nullptr; + element->sibling = nullptr; + + Property** prop_link = &element->first_property; + for (u32 i = 0; i < prop_count.getValue(); ++i) + { + OptionalError prop = readProperty(cursor, allocator); + if (prop.isError()) + { + return Error(); + } + + *prop_link = prop.getValue(); + prop_link = &(*prop_link)->next; + } + + if (cursor->current - cursor->begin >= (ptrdiff_t)end_offset.getValue()) return element; + + int BLOCK_SENTINEL_LENGTH = version >= 7500 ? 
25 : 13; + + Element** link = &element->child; + while (cursor->current - cursor->begin < ((ptrdiff_t)end_offset.getValue() - BLOCK_SENTINEL_LENGTH)) + { + OptionalError child = readElement(cursor, version, allocator); + if (child.isError()) + { + return Error(); + } + + *link = child.getValue(); + if (child.getValue() == 0) break; + link = &(*link)->sibling; + } + + if (cursor->current + BLOCK_SENTINEL_LENGTH > cursor->end) + { + return Error("Reading past the end"); + } + + cursor->current += BLOCK_SENTINEL_LENGTH; + return element; +} + + +static bool isEndLine(const Cursor& cursor) +{ + return *cursor.current == '\n' || *cursor.current == '\r' && cursor.current + 1 < cursor.end && *(cursor.current + 1) != '\n'; +} + + +static void skipInsignificantWhitespaces(Cursor* cursor) +{ + while (cursor->current < cursor->end && isspace(*cursor->current) && !isEndLine(*cursor)) + { + ++cursor->current; + } +} + + +static void skipLine(Cursor* cursor) +{ + while (cursor->current < cursor->end && !isEndLine(*cursor)) + { + ++cursor->current; + } + if (cursor->current < cursor->end) ++cursor->current; + skipInsignificantWhitespaces(cursor); +} + + +static void skipWhitespaces(Cursor* cursor) +{ + while (cursor->current < cursor->end && isspace(*cursor->current)) + { + ++cursor->current; + } + while (cursor->current < cursor->end && *cursor->current == ';') skipLine(cursor); +} + + +static bool isTextTokenChar(char c) +{ + return isalnum(c) || c == '_' || c == '-'; +} + + +static DataView readTextToken(Cursor* cursor) +{ + DataView ret; + ret.begin = cursor->current; + while (cursor->current < cursor->end && isTextTokenChar(*cursor->current)) + { + ++cursor->current; + } + ret.end = cursor->current; + return ret; +} + + +static OptionalError readTextProperty(Cursor* cursor, Allocator& allocator) +{ + Property* prop = allocator.allocate(); + prop->value.is_binary = false; + prop->next = nullptr; + if (*cursor->current == '"') + { + prop->type = 'S'; + ++cursor->current; + 
prop->value.begin = cursor->current; + while (cursor->current < cursor->end && *cursor->current != '"') + { + ++cursor->current; + } + prop->value.end = cursor->current; + if (cursor->current < cursor->end) ++cursor->current; // skip '"' + return prop; + } + + if (isdigit(*cursor->current) || *cursor->current == '-') + { + prop->type = 'L'; + prop->value.begin = cursor->current; + if (*cursor->current == '-') ++cursor->current; + while (cursor->current < cursor->end && isdigit(*cursor->current)) + { + ++cursor->current; + } + prop->value.end = cursor->current; + + if (cursor->current < cursor->end && *cursor->current == '.') + { + prop->type = 'D'; + ++cursor->current; + while (cursor->current < cursor->end && isdigit(*cursor->current)) + { + ++cursor->current; + } + if (cursor->current < cursor->end && (*cursor->current == 'e' || *cursor->current == 'E')) + { + // 10.5e-013 + ++cursor->current; + if (cursor->current < cursor->end && *cursor->current == '-') ++cursor->current; + while (cursor->current < cursor->end && isdigit(*cursor->current)) ++cursor->current; + } + + + prop->value.end = cursor->current; + } + else if (cursor->current < cursor->end && (*cursor->current == 'e' || *cursor->current == 'E')) { + prop->type = 'D'; + // 10e-013 + ++cursor->current; + if (cursor->current < cursor->end && *cursor->current == '-') ++cursor->current; + while (cursor->current < cursor->end && isdigit(*cursor->current)) ++cursor->current; + prop->value.end = cursor->current; + } + return prop; + } + + if (*cursor->current == 'T' || *cursor->current == 'Y' || *cursor->current == 'W' || *cursor->current == 'C') + { + // WTF is this + prop->type = *cursor->current; + prop->value.begin = cursor->current; + ++cursor->current; + prop->value.end = cursor->current; + return prop; + } + + if (*cursor->current == ',') { + // https://github.com/nem0/OpenFBX/issues/85 + prop->type = IElementProperty::NONE; + prop->value.begin = cursor->current; + prop->value.end = cursor->current; + 
return prop; + } + + if (*cursor->current == '*') + { + prop->type = 'l'; + ++cursor->current; + // Vertices: *10740 { a: 14.2760353088379,... } + while (cursor->current < cursor->end && *cursor->current != ':') + { + ++cursor->current; + } + if (cursor->current < cursor->end) ++cursor->current; // skip ':' + skipInsignificantWhitespaces(cursor); + prop->value.begin = cursor->current; + prop->count = 0; + bool is_any = false; + while (cursor->current < cursor->end && *cursor->current != '}') + { + if (*cursor->current == ',') + { + if (is_any) ++prop->count; + is_any = false; + } + else if (!isspace(*cursor->current) && !isEndLine(*cursor)) + is_any = true; + if (*cursor->current == '.') prop->type = 'd'; + ++cursor->current; + } + if (is_any) ++prop->count; + prop->value.end = cursor->current; + if (cursor->current < cursor->end) ++cursor->current; // skip '}' + return prop; + } + + assert(false); + return Error("Unknown error"); +} + + +static OptionalError readTextElement(Cursor* cursor, Allocator& allocator) +{ + DataView id = readTextToken(cursor); + if (cursor->current == cursor->end) return Error("Unexpected end of file"); + if (*cursor->current != ':') return Error("Unexpected character"); + ++cursor->current; + + skipInsignificantWhitespaces(cursor); + if (cursor->current == cursor->end) return Error("Unexpected end of file"); + + Element* element = allocator.allocate(); + element->id = id; + + Property** prop_link = &element->first_property; + while (cursor->current < cursor->end && !isEndLine(*cursor) && *cursor->current != '{') + { + OptionalError prop = readTextProperty(cursor, allocator); + if (prop.isError()) + { + return Error(); + } + if (cursor->current < cursor->end && *cursor->current == ',') + { + ++cursor->current; + skipWhitespaces(cursor); + } + skipInsignificantWhitespaces(cursor); + + *prop_link = prop.getValue(); + prop_link = &(*prop_link)->next; + } + + Element** link = &element->child; + if (*cursor->current == '{') + { + 
++cursor->current; + skipWhitespaces(cursor); + while (cursor->current < cursor->end && *cursor->current != '}') + { + OptionalError child = readTextElement(cursor, allocator); + if (child.isError()) + { + return Error(); + } + skipWhitespaces(cursor); + + *link = child.getValue(); + link = &(*link)->sibling; + } + if (cursor->current < cursor->end) ++cursor->current; // skip '}' + } + return element; +} + + +static OptionalError tokenizeText(const u8* data, size_t size, Allocator& allocator) +{ + Cursor cursor; + cursor.begin = data; + cursor.current = data; + cursor.end = data + size; + + Element* root = allocator.allocate(); + root->first_property = nullptr; + root->id.begin = nullptr; + root->id.end = nullptr; + root->child = nullptr; + root->sibling = nullptr; + + Element** element = &root->child; + while (cursor.current < cursor.end) + { + if (*cursor.current == ';' || *cursor.current == '\r' || *cursor.current == '\n') + { + skipLine(&cursor); + } + else + { + OptionalError child = readTextElement(&cursor, allocator); + if (child.isError()) + { + return Error(); + } + *element = child.getValue(); + if (!*element) return root; + element = &(*element)->sibling; + } + } + + return root; +} + + +static OptionalError tokenize(const u8* data, size_t size, u32& version, Allocator& allocator) { + if (size < sizeof(Header)) return Error("Invalid header"); + + Cursor cursor; + cursor.begin = data; + cursor.current = data; + cursor.end = data + size; + +#if __cplusplus >= 202002L + const Header* header = std::bit_cast(cursor.current); +#else + Header header_temp; + memcpy(&header_temp, cursor.current, sizeof(Header)); + const Header* header = &header_temp; +#endif + + cursor.current += sizeof(Header); + version = header->version; + + Element* root = allocator.allocate(); + root->first_property = nullptr; + root->id.begin = nullptr; + root->id.end = nullptr; + root->child = nullptr; + root->sibling = nullptr; + + Element** element = &root->child; + for (;;) + { + 
OptionalError child = readElement(&cursor, header->version, allocator); + if (child.isError()) + { + return Error(); + } + + *element = child.getValue(); + if (!*element) return root; + element = &(*element)->sibling; + } +} + +static void parseTemplates(const Element& root) +{ + const Element* defs = findChild(root, "Definitions"); + if (!defs) return; + + std::unordered_map templates; + Element* def = defs->child; + while (def) + { + if (def->id == "ObjectType") + { + Element* subdef = def->child; + while (subdef) + { + if (subdef->id == "PropertyTemplate") + { + DataView prop1 = def->first_property->value; + DataView prop2 = subdef->first_property->value; + std::string key((const char*)prop1.begin, prop1.end - prop1.begin); + key += std::string((const char*)prop1.begin, prop1.end - prop1.begin); + templates[key] = subdef; + } + subdef = subdef->sibling; + } + } + def = def->sibling; + } + // TODO +} + + +struct Scene; + +enum class VertexDataMapping { + BY_POLYGON_VERTEX, + BY_POLYGON, + BY_VERTEX +}; + +struct Vec2AttributesImpl { + std::vector values; + std::vector indices; + VertexDataMapping mapping; + operator Vec2Attributes() const { + return { values.data(), indices.data(), int(indices.empty() ? values.size() : indices.size()) }; + } +}; + +struct Vec3AttributesImpl { + std::vector values; + std::vector indices; + VertexDataMapping mapping; + operator Vec3Attributes() const { + return { values.data(), indices.data(), int(indices.empty() ? values.size() : indices.size()), int(values.size()) }; + } +}; + +struct Vec4AttributesImpl { + std::vector values; + std::vector indices; + VertexDataMapping mapping; + operator Vec4Attributes() const { + return { values.data(), indices.data(), int(indices.empty() ? 
values.size() : indices.size()) }; + } +}; + +struct GeometryPartitionImpl { + std::vector polygons; + int max_polygon_triangles = 0; + int triangles_count = 0; +}; + +struct GeometryDataImpl : GeometryData { + Vec3AttributesImpl positions; + Vec3AttributesImpl normals; + Vec3AttributesImpl tangents; + Vec4AttributesImpl colors; + Vec2AttributesImpl uvs[Geometry::s_uvs_max]; + std::vector partitions; + + std::vector materials; + + template + T patchAttributes(const S& attr) const { + T res = attr; + if (!attr.values.empty() && attr.mapping == VertexDataMapping::BY_VERTEX && attr.indices.empty()) { + res.indices = positions.indices.data(); + } + return res; + } + + Vec3Attributes getPositions() const override { return positions; } + Vec3Attributes getNormals() const override { return patchAttributes(normals); } + Vec2Attributes getUVs(int index) const override { return patchAttributes(uvs[index]); } + Vec4Attributes getColors() const override { return patchAttributes(colors); } + Vec3Attributes getTangents() const override { return patchAttributes(tangents); } + int getPartitionCount() const override { return (int)partitions.size(); } + + GeometryPartition getPartition(int index) const override { + if (index >= partitions.size()) return {nullptr, 0, 0, 0}; + return { + partitions[index].polygons.data(), + int(partitions[index].polygons.size()), + partitions[index].max_polygon_triangles, + partitions[index].triangles_count + }; + } + + template + bool postprocess(T& attr) { + if (attr.values.empty()) return true; + if (attr.mapping == VertexDataMapping::BY_VERTEX && !attr.indices.empty()) { + if (positions.indices.empty()) return false; // not supported + + std::vector remapped; + attr.mapping = VertexDataMapping::BY_POLYGON_VERTEX; + remapped.resize(positions.indices.size()); + for (int i = 0; i < remapped.size(); ++i) { + remapped[i] = attr.indices[decodeIndex(positions.indices[i])]; + } + attr.indices = remapped; + } + else if (attr.mapping == 
VertexDataMapping::BY_POLYGON) { + if (!attr.indices.empty()) return false; // not supported + if (partitions.size() != 1) return false; // not supported + if (partitions[0].polygons.size() != attr.values.size()) return false; // invalid + + std::vector remapped; + attr.mapping = VertexDataMapping::BY_POLYGON_VERTEX; + remapped.resize(positions.indices.size()); + + for (int i = 0, c = (int)partitions[0].polygons.size(); i < c; ++i) { + GeometryPartition::Polygon& polygon = partitions[0].polygons[i]; + for (int j = polygon.from_vertex; j < polygon.from_vertex + polygon.vertex_count; ++j) { + remapped[j] = i; + } + } + attr.indices = remapped; + } + return true; + } + + bool postprocess() { + if (materials.empty()) { + GeometryPartitionImpl& partition = emplace_back(partitions); + int polygon_count = 0; + for (int i : positions.indices) { + if (i < 0) ++polygon_count; + } + partition.polygons.reserve(polygon_count); + int polygon_start = 0; + int max_polygon_triangles = 0; + int total_triangles = 0; + int* indices = positions.indices.data(); + for (int i = 0, c = (int)positions.indices.size(); i < c; ++i) { + if (indices[i] < 0) { + int vertex_count = i - polygon_start + 1; + if (vertex_count > 2) { + partition.polygons.push_back({polygon_start, vertex_count}); + indices[i] = -indices[i] - 1; + int triangles = vertex_count - 2; + total_triangles += triangles; + if (triangles > max_polygon_triangles) max_polygon_triangles = triangles; + } + polygon_start = i + 1; + } + } + partition.max_polygon_triangles = max_polygon_triangles; + partition.triangles_count = total_triangles; + } + else { + int max_partition = 0; + for (int m : materials) { + if (m > max_partition) max_partition = m; + } + partitions.resize(max_partition + 1); + + u32 polygon_idx = 0; + int* indices = positions.indices.data(); + int num_polygon_vertices = 0; + int polygon_start = 0; + for (int i = 0, c = (int)positions.indices.size(); i < c; ++i) { + ++num_polygon_vertices; + if (indices[i] < 0) { + 
u32 material_index = materials[polygon_idx]; + GeometryPartitionImpl& partition = partitions[material_index]; + partition.polygons.push_back({polygon_start, num_polygon_vertices}); + + int triangles = num_polygon_vertices - 2; + partition.triangles_count += triangles; + if (triangles > partition.max_polygon_triangles) partition.max_polygon_triangles = triangles; + + indices[i] = -indices[i] - 1; + + polygon_start = i + 1; + ++polygon_idx; + num_polygon_vertices = 0; + } + } + } + + postprocess(normals); + postprocess(tangents); + for (Vec2AttributesImpl& uv : uvs) postprocess(uv); + postprocess(colors); + + return true; + } +}; + + +Mesh::Mesh(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + +struct GeometryImpl : Geometry, GeometryDataImpl { + const Skin* skin = nullptr; + const BlendShape* blendShape = nullptr; + + GeometryImpl(const Scene& _scene, const IElement& _element) + : Geometry(_scene, _element) + { + } + + Type getType() const override { return Type::GEOMETRY; } + const GeometryData& getGeometryData() const override { return *this; } + const Skin* getSkin() const override { return skin; } + const BlendShape* getBlendShape() const override { return blendShape; } +}; + +struct MeshImpl : Mesh +{ + MeshImpl(const Scene& _scene, const IElement& _element) + : Mesh(_scene, _element) + { + is_node = true; + } + + + DMatrix getGeometricMatrix() const override + { + DVec3 translation = resolveVec3Property(*this, "GeometricTranslation", {0, 0, 0}); + DVec3 rotation = resolveVec3Property(*this, "GeometricRotation", {0, 0, 0}); + DVec3 scale = resolveVec3Property(*this, "GeometricScaling", {1, 1, 1}); + + DMatrix scale_mtx = makeIdentity(); + scale_mtx.m[0] = (float)scale.x; + scale_mtx.m[5] = (float)scale.y; + scale_mtx.m[10] = (float)scale.z; + DMatrix mtx = getRotationMatrix(rotation, RotationOrder::EULER_XYZ); + setTranslation(translation, &mtx); + + return scale_mtx * mtx; + } + + Type getType() const override { return 
Type::MESH; } + + const Pose* getPose() const override { return pose; } + const Geometry* getGeometry() const override { return geometry; } + const Material* getMaterial(int index) const override { return materials[index]; } + int getMaterialCount() const override { return (int)materials.size(); } + + const GeometryData& getGeometryData() const override { return geometry ? static_cast(*geometry) : geometry_data; } + const Skin* getSkin() const override { return geometry ? geometry->getSkin() : skin; } + const BlendShape* getBlendShape() const override { return geometry ? geometry->getBlendShape() : blendShape; } + + const Pose* pose = nullptr; + const GeometryImpl* geometry = nullptr; + std::vector materials; + const Skin* skin = nullptr; + const BlendShape* blendShape = nullptr; + + // old formats do not use Geometry nodes but embed vertex data directly in Mesh + GeometryDataImpl geometry_data; +}; + + +Material::Material(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct MaterialImpl : Material +{ + MaterialImpl(const Scene& _scene, const IElement& _element) + : Material(_scene, _element) + { + for (const Texture*& tex : textures) tex = nullptr; + } + + Type getType() const override { return Type::MATERIAL; } + + const Texture* getTexture(Texture::TextureType type) const override { return textures[type]; } + Color getDiffuseColor() const override { return diffuse_color; } + Color getSpecularColor() const override { return specular_color; } + Color getReflectionColor() const override { return reflection_color; }; + Color getAmbientColor() const override { return ambient_color; }; + Color getEmissiveColor() const override { return emissive_color; }; + + double getDiffuseFactor() const override { return diffuse_factor; }; + double getSpecularFactor() const override { return specular_factor; }; + double getReflectionFactor() const override { return reflection_factor; }; + double getShininess() const override { return 
shininess; }; + double getShininessExponent() const override { return shininess_exponent; }; + double getAmbientFactor() const override { return ambient_factor; }; + double getBumpFactor() const override { return bump_factor; }; + double getEmissiveFactor() const override { return emissive_factor; }; + + const Texture* textures[Texture::TextureType::COUNT]; + Color diffuse_color; + Color specular_color; + Color reflection_color; + Color ambient_color; + Color emissive_color; + + double diffuse_factor; + double specular_factor; + double reflection_factor; + double shininess; + double shininess_exponent; + double ambient_factor; + double bump_factor; + double emissive_factor; + }; + + +struct LimbNodeImpl : Object +{ + LimbNodeImpl(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) + { + is_node = true; + } + Type getType() const override { return Type::LIMB_NODE; } +}; + + +struct NullImpl : Object +{ + NullImpl(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) + { + is_node = true; + } + Type getType() const override { return Type::NULL_NODE; } +}; + + +NodeAttribute::NodeAttribute(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct NodeAttributeImpl : NodeAttribute +{ + NodeAttributeImpl(const Scene& _scene, const IElement& _element) + : NodeAttribute(_scene, _element) + { + } + Type getType() const override { return Type::NODE_ATTRIBUTE; } + DataView getAttributeType() const override { return attribute_type; } + + + DataView attribute_type; +}; + + +Geometry::Geometry(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +Shape::Shape(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct ShapeImpl : Shape { + std::vector vertices; + std::vector normals; + std::vector indices; + + ShapeImpl(const Scene& _scene, const IElement& _element) + : Shape(_scene, _element) + {} + + bool 
postprocess(GeometryImpl& geom, Allocator& allocator); + + Type getType() const override { return Type::SHAPE; } + int getVertexCount() const override { return (int)vertices.size(); } + int getIndexCount() const override { return (int)indices.size(); } + const Vec3* getVertices() const override { return &vertices[0]; } + const Vec3* getNormals() const override { return normals.empty() ? nullptr : &normals[0]; } + const int* getIndices() const override { return indices.empty() ? nullptr : &indices[0]; } +}; + + +Cluster::Cluster(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct ClusterImpl : Cluster +{ + ClusterImpl(const Scene& _scene, const IElement& _element) + : Cluster(_scene, _element) + { + } + + const int* getIndices() const override { return &indices[0]; } + int getIndicesCount() const override { return (int)indices.size(); } + const double* getWeights() const override { return &weights[0]; } + int getWeightsCount() const override { return (int)weights.size(); } + DMatrix getTransformMatrix() const override { return transform_matrix; } + DMatrix getTransformLinkMatrix() const override { return transform_link_matrix; } + Object* getLink() const override { return link; } + + bool postprocess() { + assert(skin); + + GeometryDataImpl* geom = static_cast(static_cast(skin->resolveObjectLinkReverse(Object::Type::GEOMETRY))); + if (!geom) { + MeshImpl* mesh = (MeshImpl*)skin->resolveObjectLinkReverse(Object::Type::MESH); + if(!mesh) return false; + geom = &mesh->geometry_data; + } + + const Element* indexes = findChild((const Element&)element, "Indexes"); + if (indexes && indexes->first_property) + { + if (!parseVecData(*indexes->first_property, &indices)) return false; + } + + const Element* weights_el = findChild((const Element&)element, "Weights"); + if (weights_el && weights_el->first_property) + { + if (!parseVecData(*weights_el->first_property, &weights)) return false; + } + + return true; + } + + + Object* link = 
nullptr; + Skin* skin = nullptr; + std::vector indices; + std::vector weights; + DMatrix transform_matrix; + DMatrix transform_link_matrix; + Type getType() const override { return Type::CLUSTER; } +}; + + +AnimationStack::AnimationStack(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +AnimationLayer::AnimationLayer(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +AnimationCurve::AnimationCurve(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +AnimationCurveNode::AnimationCurveNode(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct AnimationStackImpl : AnimationStack +{ + AnimationStackImpl(const Scene& _scene, const IElement& _element) + : AnimationStack(_scene, _element) + { + } + + + const AnimationLayer* getLayer(int index) const override + { + return resolveObjectLink(index); + } + + + Type getType() const override { return Type::ANIMATION_STACK; } +}; + + +struct AnimationCurveImpl : AnimationCurve +{ + AnimationCurveImpl(const Scene& _scene, const IElement& _element) + : AnimationCurve(_scene, _element) + { + } + + int getKeyCount() const override { return (int)times.size(); } + const i64* getKeyTime() const override { return times.empty() ? nullptr : times.data(); } // key times; nullptr when the curve has no keys + const float* getKeyValue() const override { return values.empty() ? nullptr : values.data(); } // key values; nullptr when the curve has no keys + + std::vector<i64> times; + std::vector<float> values; + Type getType() const override { return Type::ANIMATION_CURVE; } +}; + + +Skin::Skin(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct SkinImpl : Skin +{ + SkinImpl(const Scene& _scene, const IElement& _element) + : Skin(_scene, _element) + { + } + + int getClusterCount() const override { return (int)clusters.size(); } + const Cluster* getCluster(int idx) const override { return clusters[idx]; } + + Type getType() const override { return Type::SKIN; } + + std::vector clusters; +}; + + 
+BlendShapeChannel::BlendShapeChannel(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct BlendShapeChannelImpl : BlendShapeChannel +{ + BlendShapeChannelImpl(const Scene& _scene, const IElement& _element) + : BlendShapeChannel(_scene, _element) + { + } + + double getDeformPercent() const override { return deformPercent; } + int getShapeCount() const override { return (int)shapes.size(); } + const Shape* getShape(int idx) const override { return shapes[idx]; } + + Type getType() const override { return Type::BLEND_SHAPE_CHANNEL; } + + bool postprocess(Allocator& allocator) { + assert(blendShape); + + GeometryImpl* geom = (GeometryImpl*)blendShape->resolveObjectLinkReverse(Object::Type::GEOMETRY); + if (!geom) return false; + + const Element* deform_percent_el = findChild((const Element&)element, "DeformPercent"); + if (deform_percent_el && deform_percent_el->first_property) + { + if (!parseDouble(*deform_percent_el->first_property, &deformPercent)) return false; + } + + const Element* full_weights_el = findChild((const Element&)element, "FullWeights"); + if (full_weights_el && full_weights_el->first_property) + { + if (!parseVecData(*full_weights_el->first_property, &fullWeights)) return false; + } + + for (int i = 0; i < (int)shapes.size(); i++) + { + auto shape = (ShapeImpl*)shapes[i]; + if (!shape->postprocess(*geom, allocator)) return false; + } + + return true; + } + + + BlendShape* blendShape = nullptr; + double deformPercent = 0; + std::vector fullWeights; + std::vector shapes; +}; + + +BlendShape::BlendShape(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct BlendShapeImpl : BlendShape +{ + BlendShapeImpl(const Scene& _scene, const IElement& _element) + : BlendShape(_scene, _element) + { + } + + int getBlendShapeChannelCount() const override { return (int)blendShapeChannels.size(); } + const BlendShapeChannel* getBlendShapeChannel(int idx) const override { return 
blendShapeChannels[idx]; } + + Type getType() const override { return Type::BLEND_SHAPE; } + + std::vector blendShapeChannels; +}; + + +Texture::Texture(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +Pose::Pose(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) +{ +} + + +struct PoseImpl : Pose +{ + PoseImpl(const Scene& _scene, const IElement& _element) + : Pose(_scene, _element) + {} + + bool postprocess(Scene& scene); + DMatrix getMatrix() const override { return matrix; } + const Object* getNode() const override { return node; } + Type getType() const override { return Type::POSE; } + + DMatrix matrix; + Object* node = nullptr; + u64 node_id; +}; + + +struct TextureImpl : Texture +{ + TextureImpl(const Scene& _scene, const IElement& _element) + : Texture(_scene, _element) + { + } + + DataView getRelativeFileName() const override { return relative_filename; } + DataView getFileName() const override { return filename; } + DataView getEmbeddedData() const override; + + DataView media; + DataView filename; + DataView relative_filename; + Type getType() const override { return Type::TEXTURE; } +}; + +struct LightImpl : Light +{ + LightImpl(const Scene& _scene, const IElement& _element) + : Light(_scene, _element) + { + } + + Type getType() const override { return Type::LIGHT; } + LightType getLightType() const override { return lightType; } + + bool doesCastLight() const override { return castLight; } + + bool doesDrawVolumetricLight() const override + { + // Return the draw volumetric light property based on the stored data (WIP) + return false; + } + + bool doesDrawGroundProjection() const override + { + // Return the draw ground projection property based on the stored data (WIP) + return false; + } + + bool doesDrawFrontFacingVolumetricLight() const override + { + // Return the draw front-facing volumetric light property based on the stored data (WIP) + return false; + } + + Color getColor() const 
override { return color; } + double getIntensity() const override { return intensity; } + double getInnerAngle() const override { return innerAngle; } + double getOuterAngle() const override { return outerAngle; } + + double getFog() const override { return fog; } + + DecayType getDecayType() const override { return decayType; } + double getDecayStart() const override { return decayStart; } + + // Near attenuation + bool doesEnableNearAttenuation() const override { return enableNearAttenuation; } + double getNearAttenuationStart() const override { return nearAttenuationStart; } + double getNearAttenuationEnd() const override { return nearAttenuationEnd; } + + // Far attenuation + bool doesEnableFarAttenuation() const override { return enableFarAttenuation; } + double getFarAttenuationStart() const override { return farAttenuationStart; } + double getFarAttenuationEnd() const override { return farAttenuationEnd; } + + // Shadows + const Texture* getShadowTexture() const override { return shadowTexture; } + bool doesCastShadows() const override { return castShadows; } + Color getShadowColor() const override { return shadowColor; } + + // Member variables to store light properties + //------------------------------------------------------------------------- + LightType lightType = LightType::POINT; + bool castLight = true; + Color color = {1, 1, 1}; // Light color (RGB values) + double intensity = 100.0; + + double innerAngle = 0.0; + double outerAngle = 45.0; + + double fog = 50; + + DecayType decayType = DecayType::QUADRATIC; + double decayStart = 1.0; + + bool enableNearAttenuation = false; + double nearAttenuationStart = 0.0; + double nearAttenuationEnd = 0.0; + + bool enableFarAttenuation = false; + double farAttenuationStart = 0.0; + double farAttenuationEnd = 0.0; + + const Texture* shadowTexture = nullptr; + bool castShadows = true; + Color shadowColor = {0, 0, 0}; +}; + +static float OFBX_PI = 3.14159265358979323846f; +struct CameraImpl : public Camera +{ + 
CameraImpl(const Scene& _scene, const IElement& _element) + : Camera(_scene, _element) + { + } + + ProjectionType projectionType = ProjectionType::PERSPECTIVE; + ApertureMode apertureMode = ApertureMode::HORIZONTAL; // Used to determine the FOV + + double filmHeight = 36.0; + double filmWidth = 24.0; + + double aspectHeight = 1.0; + double aspectWidth = 1.0; + + double nearPlane = 0.1; + double farPlane = 1000.0; + bool autoComputeClipPanes = true; + + GateFit gateFit = GateFit::HORIZONTAL; + double filmAspectRatio = 1.0; + double focalLength = 50.0; + double focusDistance = 50.0; + + DVec3 backgroundColor = {0, 0, 0}; + DVec3 interestPosition = {0, 0, 0}; + + double fieldOfView = 60.0; + + Type getType() const override { return Type::CAMERA; } + ProjectionType getProjectionType() const override { return projectionType; } + ApertureMode getApertureMode() const override { return apertureMode; } + + double getFilmHeight() const override { return filmHeight; } + double getFilmWidth() const override { return filmWidth; } + + double getAspectHeight() const override { return aspectHeight; } + double getAspectWidth() const override { return aspectWidth; } + + double getNearPlane() const override { return nearPlane; } + double getFarPlane() const override { return farPlane; } + bool doesAutoComputeClipPanes() const override { return autoComputeClipPanes; } + + GateFit getGateFit() const override { return gateFit; } + double getFilmAspectRatio() const override { return filmAspectRatio; } + double getFocalLength() const override { return focalLength; } + double getFocusDistance() const override { return focusDistance; } + + DVec3 getBackgroundColor() const override { return backgroundColor; } + DVec3 getInterestPosition() const override { return interestPosition; } + + void CalculateFOV() + { + switch (apertureMode) + { + case Camera::ApertureMode::HORIZONTAL: + fieldOfView = 2.0 * atan(filmWidth / (2.0 * focalLength)) * 180.0 / OFBX_PI; + return; + case 
Camera::ApertureMode::VERTICAL: + fieldOfView = 2.0 * atan(filmHeight / (2.0 * focalLength)) * 180.0 / OFBX_PI; + return; + case Camera::ApertureMode::HORIZANDVERT: + fieldOfView = 2.0 * atan(sqrt(filmWidth * filmWidth + filmHeight * filmHeight) / (2.0 * focalLength)) * 180.0 / OFBX_PI; + return; + case Camera::ApertureMode::FOCALLENGTH: + fieldOfView = 2.0 * atan(filmHeight / (2.0 * focalLength)) * 180.0 / OFBX_PI; // Same as vertical ¯\_(ツ)_/¯ + return; + default: + fieldOfView = 60.0; + } + } +}; + +struct Root : Object +{ + Root(const Scene& _scene, const IElement& _element) + : Object(_scene, _element) + { + copyString(name, "RootNode"); + is_node = true; + } + Type getType() const override { return Type::ROOT; } +}; + + +struct Scene : IScene +{ + struct Connection + { + enum Type + { + OBJECT_OBJECT, + OBJECT_PROPERTY, + PROPERTY_OBJECT, + PROPERTY_PROPERTY, + }; + + Type type = OBJECT_OBJECT; + u64 from_object = 0; + u64 to_object = 0; + DataView from_property; + DataView to_property; + }; + + struct ObjectPair + { + const Element* element; + Object* object; + }; + + + int getAnimationStackCount() const override { return (int)m_animation_stacks.size(); } + int getGeometryCount() const override { return (int)m_geometries.size(); } + int getMeshCount() const override { return (int)m_meshes.size(); } + float getSceneFrameRate() const override { return m_scene_frame_rate; } + const GlobalSettings* getGlobalSettings() const override { return &m_settings; } + + const Object* const* getAllObjects() const override { return m_all_objects.empty() ? 
nullptr : &m_all_objects[0]; } + + + int getAllObjectCount() const override { return (int)m_all_objects.size(); } + + int getEmbeddedDataCount() const override { + return (int)m_videos.size(); + } + + DataView getEmbeddedData(int index) const override { + return m_videos[index].content; + } + + bool isEmbeddedBase64(int index) const override { + return m_videos[index].is_base_64; + } + + const IElementProperty* getEmbeddedBase64Data(int index) const override { + return m_videos[index].base64_property; + } + + DataView getEmbeddedFilename(int index) const override { + return m_videos[index].filename; + } + + const AnimationStack* getAnimationStack(int index) const override + { + assert(index >= 0); + assert(index < m_animation_stacks.size()); + return m_animation_stacks[index]; + } + + + const Mesh* getMesh(int index) const override + { + assert(index >= 0); + assert(index < m_meshes.size()); + return m_meshes[index]; + } + + + const Geometry* getGeometry(int index) const override + { + assert(index >= 0); + assert(index < m_geometries.size()); + return m_geometries[index]; + } + + + const TakeInfo* getTakeInfo(const char* name) const override + { + for (const TakeInfo& info : m_take_infos) + { + if (info.name == name) return &info; + } + return nullptr; + } + + const Camera* getCamera(int index) const override + { + assert(index >= 0); + assert(index < m_cameras.size()); + return m_cameras[index]; + } + + int getCameraCount() const override + { + return (int)m_cameras.size(); + } + + const Light* getLight(int index) const override + { + assert(index >= 0); + assert(index < m_lights.size()); + return m_lights[index]; + } + + int getLightCount() const override + { + return (int)m_lights.size(); + } + + + const IElement* getRootElement() const override { return m_root_element; } + const Object* getRoot() const override { return m_root; } + + + void destroy() override { delete this; } + + + ~Scene() override { + for(Object* ptr : m_all_objects) { + ptr->~Object(); + } 
+ } + + bool finalize(); + + Element* m_root_element = nullptr; + Root* m_root = nullptr; + float m_scene_frame_rate = -1; + GlobalSettings m_settings; + + std::unordered_map m_fake_ids; + std::unordered_map m_object_map; + std::vector m_all_objects; + std::vector m_meshes; + std::vector m_geometries; + std::vector m_animation_stacks; + std::vector m_cameras; + std::vector m_lights; + std::vector m_connections; + std::vector m_data; + std::vector m_take_infos; + std::vector