Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
93.25% |
152 / 163 |
|
91.67% |
11 / 12 |
CRAP | |
0.00% |
0 / 1 |
| Parser | |
93.25% |
152 / 163 |
|
91.67% |
11 / 12 |
63.18 | |
0.00% |
0 / 1 |
| getObjectStreams | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getObjectMap | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getFonts | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| parseFile | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| parseData | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| parse | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
15 | |||
| initFile | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
| initData | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| mapObjects | |
100.00% |
62 / 62 |
|
100.00% |
1 / 1 |
9 | |||
| mapFonts | |
69.44% |
25 / 36 |
|
0.00% |
0 / 1 |
23.30 | |||
| filterPages | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
7 | |||
| getObjects | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
7 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Pop PHP Framework (https://www.popphp.org/) |
| 4 | * |
| 5 | * @link https://github.com/popphp/popphp-framework |
| 6 | * @author Nick Sagona, III <dev@noladev.com> |
| 7 | * @copyright Copyright (c) 2009-2025 NOLA Interactive, LLC. |
| 8 | * @license https://www.popphp.org/license New BSD License |
| 9 | */ |
| 10 | |
| 11 | /** |
| 12 | * @namespace |
| 13 | */ |
| 14 | namespace Pop\Pdf\Build; |
| 15 | |
| 16 | use Pop\Pdf\Document\AbstractDocument; |
| 17 | |
| 18 | /** |
| 19 | * Pdf parser class |
| 20 | * |
| 21 | * @category Pop |
| 22 | * @package Pop\Pdf |
| 23 | * @author Nick Sagona, III <dev@noladev.com> |
| 24 | * @copyright Copyright (c) 2009-2025 NOLA Interactive, LLC. |
| 25 | * @license https://www.popphp.org/license New BSD License |
| 26 | * @version 5.2.2 |
| 27 | */ |
class Parser extends AbstractParser
{

    /**
     * Parsed object data streams
     *
     * Raw text of every "N G obj ... endobj" section found in the PDF data.
     * @var array
     */
    protected array $objectStreams = [];

    /**
     * Object map
     *
     * Keyed by 'root', 'parent', 'info', 'pages' and 'streams'; see mapObjects() for the entry shapes.
     * @var array
     */
    protected array $objectMap = [];

    /**
     * Document fonts
     *
     * List of ['name' => ..., 'index' => ..., 'ref' => ...] entries built by mapFonts().
     * @var array
     */
    protected array $fonts = [];

    /**
     * Get the object streams
     *
     * @return array
     */
    public function getObjectStreams(): array
    {
        return $this->objectStreams;
    }

    /**
     * Get the object map
     *
     * @return array
     */
    public function getObjectMap(): array
    {
        return $this->objectMap;
    }

    /**
     * Get the document fonts
     *
     * @return array
     */
    public function getFonts(): array
    {
        return $this->fonts;
    }

    /**
     * Parse from file
     *
     * @param  string $file  Path of the PDF file to read
     * @param  mixed  $pages Page number, or array of page numbers, to import; null imports all pages
     * @throws Exception
     * @return AbstractDocument
     */
    public function parseFile(string $file, mixed $pages = null): AbstractDocument
    {
        $this->initFile($file);
        return $this->parse($pages);
    }

    /**
     * Parse from raw data stream
     *
     * @param  string $data  Raw PDF data
     * @param  mixed  $pages Page number, or array of page numbers, to import; null imports all pages
     * @throws Exception
     * @return AbstractDocument
     */
    public function parseData(string $data, mixed $pages = null): AbstractDocument
    {
        $this->initData($data);
        return $this->parse($pages);
    }

    /**
     * Parse the data stream
     *
     * Splits the current data into object sections, maps them, optionally
     * filters the pages, and assembles a new document from the result.
     *
     * @param  mixed $pages Page number, or array of page numbers, to import; null imports all pages
     * @return AbstractDocument
     */
    public function parse(mixed $pages = null): AbstractDocument
    {
        $matches = [];
        preg_match_all('/\d*\s\d*\sobj(.*?)endobj/sm', $this->data, $matches, PREG_OFFSET_CAPTURE);

        // With PREG_OFFSET_CAPTURE each match is a [text, offset] pair; only the text is kept.
        foreach (($matches[0] ?? []) as $match) {
            if ((!str_contains($match[0], '/Linearized')) && (!str_contains($match[0], '/Type/Metadata'))) {
                $this->objectStreams[] = $match[0];
            }
        }

        // Map the objects by parsing the object streams
        $this->mapObjects();

        if (isset($this->objectMap['pages'])) {
            // Map fonts, if any
            if (isset($this->objectMap['streams'])) {
                $this->mapFonts();
            }
            // If certain pages are to be imported, filter out the unwanted pages
            if ($pages !== null) {
                $this->filterPages($pages);
            }
        }

        $doc = new \Pop\Pdf\Document();

        if (isset($this->objectMap['root']['object'])) {
            $doc->setVersion($this->objectMap['root']['object']->getVersion());
        }
        if (isset($this->objectMap['info']['object'])) {
            $doc->setMetadata($this->objectMap['info']['object']->getMetadata());
        }

        $doc->importObjects($this->getObjects());
        $doc->importFonts($this->getFonts());

        if (isset($this->objectMap['pages'])) {
            foreach ($this->objectMap['pages'] as $i => $page) {
                $pg = new \Pop\Pdf\Document\Page($page['width'], $page['height'], $i);
                $pg->importPageObject($page['object']);
                $doc->addPage($pg);
            }
        }

        return $doc;
    }

    /**
     * Initialize the file and get the data
     *
     * @param  string $file Path of the PDF file to read
     * @throws Exception If the file does not exist or cannot be read
     * @return Parser
     */
    protected function initFile(string $file): Parser
    {
        if (!file_exists($file)) {
            throw new Exception('Error: That PDF file does not exist.');
        }

        // file_exists() is also true for directories and unreadable files, in which
        // case file_get_contents() returns false instead of the contents.
        $data = file_get_contents($file);
        if ($data === false) {
            throw new Exception('Error: Unable to read that PDF file.');
        }

        $this->file = $file;

        return $this->initData($data);
    }

    /**
     * Initialize data
     *
     * Stores the raw data and clears everything derived from a previous parse.
     *
     * @param  string $data Raw PDF data
     * @return Parser
     */
    protected function initData(string $data): Parser
    {
        $this->data = $data;

        $this->objectStreams = [];
        $this->objectMap     = [];
        $this->fonts         = [];

        return $this;
    }

    /**
     * Map the objects
     *
     * Parses each object stream according to its type and records it in the
     * object map. 'root', 'parent' and 'info' hold a single entry each, while
     * 'pages' and 'streams' are keyed by the parsed object's index.
     *
     * @return void
     */
    protected function mapObjects(): void
    {
        foreach ($this->objectStreams as $stream) {
            switch ($this->getStreamType($stream)) {
                case 'root':
                    $root = PdfObject\RootObject::parse($stream);
                    $root->setImported(true);
                    // Characters 5-7 of the data, e.g. the "1.7" of a "%PDF-1.7" header
                    $root->setVersion(substr($this->data, 5, 3));
                    $this->objectMap['root'] = [
                        'stream' => $stream,
                        'object' => $root,
                        'index'  => $root->getIndex(),
                        'parent' => $root->getParentIndex(),
                    ];
                    break;
                case 'parent':
                    $parent = PdfObject\ParentObject::parse($stream);
                    $parent->setImported(true);
                    $this->objectMap['parent'] = [
                        'stream' => $stream,
                        'object' => $parent,
                        'index'  => $parent->getIndex(),
                        'count'  => $parent->getCount(),
                        'kids'   => $parent->getKids(),
                    ];
                    break;
                case 'info':
                    $info = PdfObject\InfoObject::parse($stream);
                    $info->setImported(true);
                    $this->objectMap['info'] = [
                        'stream' => $stream,
                        'object' => $info,
                        'index'  => $info->getIndex(),
                    ];
                    break;
                case 'page':
                    if (!isset($this->objectMap['pages'])) {
                        $this->objectMap['pages'] = [];
                    }

                    $page = PdfObject\PageObject::parse($stream);
                    $page->setImported(true);

                    $this->objectMap['pages'][$page->getIndex()] = [
                        'stream'   => $stream,
                        'object'   => $page,
                        'index'    => $page->getIndex(),
                        'parent'   => $page->getParentIndex(),
                        'width'    => $page->getWidth(),
                        'height'   => $page->getHeight(),
                        'content'  => $page->getContent(),
                        'annots'   => $page->getAnnots(),
                        'fonts'    => $page->getFonts(),
                        'xObjects' => $page->getXObjects(),
                    ];
                    break;
                case 'stream':
                    if (!isset($this->objectMap['streams'])) {
                        $this->objectMap['streams'] = [];
                    }

                    $streamObject = PdfObject\StreamObject::parse($stream);
                    $streamObject->setImported(true);

                    // NOTE(review): unlike the other entry types, 'stream' holds the parsed
                    // object here rather than the raw text. mapFonts() reads this key, so the
                    // stored value is kept as-is; confirm before switching it to the raw text.
                    $this->objectMap['streams'][$streamObject->getIndex()] = [
                        'stream' => $streamObject,
                        'object' => $streamObject,
                        'index'  => $streamObject->getIndex(),
                    ];
                    break;
            }
        }
    }

    /**
     * Map the fonts, if any
     *
     * Collects the /BaseFont name of every font referenced by a page into
     * the fonts list, skipping duplicates.
     *
     * @return void
     */
    protected function mapFonts(): void
    {
        foreach ($this->objectMap['pages'] as $page) {
            if (empty($page['fonts'])) {
                continue;
            }

            foreach ($page['fonts'] as $i => $font) {
                // A page may reference a font object that was not mapped as a stream
                if (!isset($this->objectMap['streams'][$i])) {
                    continue;
                }

                // presumably converted to text by the string functions below - TODO confirm
                $fontStream = $this->objectMap['streams'][$i]['stream'];
                if (!str_contains($fontStream, '/BaseFont')) {
                    continue;
                }

                // Everything after the "/BaseFont" key (9 characters long)
                $fontName = trim(substr($fontStream, (strpos($fontStream, '/BaseFont') + 9)));

                if (str_starts_with($fontName, '/')) {
                    $fontName = substr($fontName, 1);
                }

                // The name ends at the next '/', or failing that at the next '>'.
                // If neither is present the remaining text is kept as the name.
                $end = strpos($fontName, '/');
                if ($end === false) {
                    $end = strpos($fontName, '>');
                }
                if ($end !== false) {
                    $fontName = substr($fontName, 0, $end);
                }

                $f = [
                    'name'  => trim($fontName),
                    'index' => $i,
                    'ref'   => $font,
                ];

                if (!in_array($f, $this->fonts, true)) {
                    $this->fonts[] = $f;
                }
            }
        }

        // Collect the object ids referenced by /FontFile* keys ("/FontFile2 13 0 R" gives "13")
        $fontFileObjects = [];
        foreach ($this->objectStreams as $stream) {
            if (str_contains($stream, '/FontFile')) {
                $fontFileObject    = substr($stream, strpos($stream, '/FontFile'));
                $fontFileObject    = substr($fontFileObject, (strpos($fontFileObject, ' ') + 1));
                $fontFileObject    = trim(substr($fontFileObject, 0, strpos($fontFileObject, '0 R')));
                $fontFileObjects[] = $fontFileObject;
            }
        }

        // NOTE(review): only object id 13 is ever handled and the resulting font parser
        // is never used, so this looks like unfinished work. Behaviour is preserved as-is;
        // confirm the intent before generalising or removing it.
        foreach ($fontFileObjects as $fontFileObject) {
            if (($fontFileObject == 13) && isset($this->objectMap['streams'][$fontFileObject])) {
                $fontFile = $this->objectMap['streams'][$fontFileObject];
                $contents = ($fontFile['object']->getEncoding() == 'FlateDecode') ?
                    gzuncompress(trim($fontFile['object']->getStream())) : $fontFile['object']->getStream();

                $fontParser = new \Pop\Pdf\Build\Font\TrueType(null, $contents);
            }
        }
    }

    /**
     * Filter pages
     *
     * Keeps only the requested pages in the parent object's kids list and
     * removes the other kids from the page map.
     *
     * @param  mixed $pages Page number, or array of page numbers, counted from 1 in kids order
     * @return void
     */
    protected function filterPages(mixed $pages): void
    {
        $pages = is_array($pages) ? $pages : [$pages];
        $kids  = $this->objectMap['parent']['object']->getKids();
        $keep  = [];

        foreach ($pages as $page) {
            // Page numbers are 1-based, the kids list is 0-based; unknown pages are ignored
            if (isset($kids[$page - 1])) {
                $keep[] = $kids[$page - 1];
            }
        }

        $this->objectMap['parent']['object']->setKids($keep);
        $this->objectMap['parent']['count'] = count($keep);
        $this->objectMap['parent']['kids']  = $keep;

        foreach ($kids as $kid) {
            // $keep is built from $kids itself, so a strict comparison is safe
            if (!in_array($kid, $keep, true) && isset($this->objectMap['pages'][$kid])) {
                unset($this->objectMap['pages'][$kid]);
            }
        }
    }

    /**
     * Get the objects for import
     *
     * Returns the root, parent, info and stream objects keyed by their
     * object index. Page entries are not part of the result.
     *
     * @return array
     */
    protected function getObjects(): array
    {
        $objects = [];

        foreach ($this->objectMap as $type => $object) {
            if (in_array($type, ['root', 'parent', 'info'], true)) {
                $objects[$object['index']] = $object['object'];
            } elseif ($type === 'streams') {
                foreach ($object as $obj) {
                    // 'object' and 'stream' hold the same parsed object for stream entries
                    $objects[$obj['index']] = $obj['object'];
                }
            }
        }

        return $objects;
    }

}