libcudf  24.04.00
column_wrapper.hpp
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf_test/column_utilities.hpp>
20 #include <cudf_test/cudf_gtest.hpp>
21 #include <cudf_test/default_stream.hpp>
22 
23 #include <cudf/column/column.hpp>
25 #include <cudf/copying.hpp>
26 #include <cudf/detail/concatenate.hpp>
27 #include <cudf/detail/iterator.cuh>
28 #include <cudf/detail/null_mask.hpp>
29 #include <cudf/detail/utilities/vector_factories.hpp>
33 #include <cudf/null_mask.hpp>
34 #include <cudf/types.hpp>
35 #include <cudf/utilities/bit.hpp>
39 
40 #include <rmm/device_buffer.hpp>
41 #include <rmm/mr/device/per_device_resource.hpp>
42 
43 #include <thrust/copy.h>
44 #include <thrust/functional.h>
45 #include <thrust/host_vector.h>
46 #include <thrust/iterator/constant_iterator.h>
47 #include <thrust/iterator/counting_iterator.h>
48 #include <thrust/iterator/transform_iterator.h>
49 
50 #include <algorithm>
51 #include <iterator>
52 #include <memory>
53 #include <numeric>
54 
55 namespace cudf {
56 namespace test {
57 namespace detail {
67  public:
75  operator column_view() const { return wrapped->view(); }
76 
84  operator mutable_column_view() { return wrapped->mutable_view(); }
85 
91  std::unique_ptr<cudf::column> release() { return std::move(wrapped); }
92 
93  protected:
94  std::unique_ptr<cudf::column> wrapped{};
95 };
96 
100 template <typename From, typename To>
110  template <typename FromT = From,
111  typename ToT = To,
112  std::enable_if_t<std::is_same_v<FromT, ToT>, void>* = nullptr>
113  constexpr ToT operator()(FromT element) const
114  {
115  return element;
116  }
117 
126  template <
127  typename FromT = From,
128  typename ToT = To,
129  std::enable_if_t<!std::is_same_v<FromT, ToT> && (cudf::is_convertible<FromT, ToT>::value ||
130  std::is_constructible_v<ToT, FromT>),
131  void>* = nullptr>
132  constexpr ToT operator()(FromT element) const
133  {
134  return static_cast<ToT>(element);
135  }
136 
145  template <
146  typename FromT = From,
147  typename ToT = To,
148  std::enable_if_t<std::is_integral_v<FromT> && cudf::is_timestamp<ToT>(), void>* = nullptr>
149  constexpr ToT operator()(FromT element) const
150  {
151  return ToT{typename ToT::duration{element}};
152  }
153 };
154 
165 template <typename ElementTo,
166  typename ElementFrom,
167  typename InputIterator,
168  std::enable_if_t<not cudf::is_fixed_point<ElementTo>()>* = nullptr>
169 rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
170 {
171  static_assert(cudf::is_fixed_width<ElementTo>(), "Unexpected non-fixed width type.");
172  auto transformer = fixed_width_type_converter<ElementFrom, ElementTo>{};
173  auto transform_begin = thrust::make_transform_iterator(begin, transformer);
174  auto const size = cudf::distance(begin, end);
175  auto const elements = thrust::host_vector<ElementTo>(transform_begin, transform_begin + size);
176  return rmm::device_buffer{
177  elements.data(), size * sizeof(ElementTo), cudf::test::get_default_stream()};
178 }
179 
// The two overloads below have signatures identical to the overload above apart from
// SFINAE, so doxygen would see them as duplicates.
183 
194 template <typename ElementTo,
195  typename ElementFrom,
196  typename InputIterator,
197  std::enable_if_t<not cudf::is_fixed_point<ElementFrom>() and
198  cudf::is_fixed_point<ElementTo>()>* = nullptr>
199 rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
200 {
201  using RepType = typename ElementTo::rep;
202  auto transformer = fixed_width_type_converter<ElementFrom, RepType>{};
203  auto transform_begin = thrust::make_transform_iterator(begin, transformer);
204  auto const size = cudf::distance(begin, end);
205  auto const elements = thrust::host_vector<RepType>(transform_begin, transform_begin + size);
206  return rmm::device_buffer{
207  elements.data(), size * sizeof(RepType), cudf::test::get_default_stream()};
208 }
209 
220 template <typename ElementTo,
221  typename ElementFrom,
222  typename InputIterator,
223  std::enable_if_t<cudf::is_fixed_point<ElementFrom>() and
224  cudf::is_fixed_point<ElementTo>()>* = nullptr>
225 rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
226 {
227  using namespace numeric;
228  using RepType = typename ElementTo::rep;
229 
230  auto to_rep = [](ElementTo fp) { return fp.value(); };
231  auto transformer_begin = thrust::make_transform_iterator(begin, to_rep);
232  auto const size = cudf::distance(begin, end);
233  auto const elements = thrust::host_vector<RepType>(transformer_begin, transformer_begin + size);
234  return rmm::device_buffer{
235  elements.data(), size * sizeof(RepType), cudf::test::get_default_stream()};
236 }
238 
252 template <typename ValidityIterator>
253 std::pair<std::vector<bitmask_type>, cudf::size_type> make_null_mask_vector(ValidityIterator begin,
254  ValidityIterator end)
255 {
256  auto const size = cudf::distance(begin, end);
257  auto const num_words = cudf::bitmask_allocation_size_bytes(size) / sizeof(bitmask_type);
258 
259  auto null_mask = std::vector<bitmask_type>(num_words, 0);
260  auto null_count = cudf::size_type{0};
261  for (auto i = 0; i < size; ++i) {
262  if (*(begin + i)) {
263  set_bit_unsafe(null_mask.data(), i);
264  } else {
265  ++null_count;
266  }
267  }
268 
269  return {std::move(null_mask), null_count};
270 }
271 
285 template <typename ValidityIterator>
286 std::pair<rmm::device_buffer, cudf::size_type> make_null_mask(ValidityIterator begin,
287  ValidityIterator end)
288 {
289  auto [null_mask, null_count] = make_null_mask_vector(begin, end);
290  auto d_mask = rmm::device_buffer{null_mask.data(),
293  return {std::move(d_mask), null_count};
294 }
295 
310 template <typename StringsIterator, typename ValidityIterator>
311 auto make_chars_and_offsets(StringsIterator begin, StringsIterator end, ValidityIterator v)
312 {
313  std::vector<char> chars{};
314  std::vector<cudf::size_type> offsets(1, 0);
315  for (auto str = begin; str < end; ++str) {
316  std::string tmp = (*v++) ? std::string(*str) : std::string{};
317  chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp));
318  offsets.push_back(offsets.back() + tmp.length());
319  }
320  return std::pair(std::move(chars), std::move(offsets));
321 };
322 } // namespace detail
323 
332 template <typename ElementTo, typename SourceElementT = ElementTo>
334  public:
338  fixed_width_column_wrapper() : column_wrapper{}
339  {
340  std::vector<ElementTo> empty;
341  wrapped.reset(
342  new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
343  0,
344  detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end()),
345  rmm::device_buffer{},
346  0});
347  }
348 
367  template <typename InputIterator>
368  fixed_width_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
369  {
370  auto const size = cudf::distance(begin, end);
371  wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
372  size,
373  detail::make_elements<ElementTo, SourceElementT>(begin, end),
374  rmm::device_buffer{},
375  0});
376  }
377 
401  template <typename InputIterator, typename ValidityIterator>
402  fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
403  : column_wrapper{}
404  {
405  auto const size = cudf::distance(begin, end);
406  auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
407  wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
408  size,
409  detail::make_elements<ElementTo, SourceElementT>(begin, end),
410  std::move(null_mask),
411  null_count});
412  }
413 
426  template <typename ElementFrom>
427  fixed_width_column_wrapper(std::initializer_list<ElementFrom> elements)
428  : fixed_width_column_wrapper(std::cbegin(elements), std::cend(elements))
429  {
430  }
431 
449  template <typename ElementFrom>
450  fixed_width_column_wrapper(std::initializer_list<ElementFrom> elements,
451  std::initializer_list<bool> validity)
452  : fixed_width_column_wrapper(std::cbegin(elements), std::cend(elements), std::cbegin(validity))
453  {
454  }
455 
473  template <typename ValidityIterator, typename ElementFrom>
474  fixed_width_column_wrapper(std::initializer_list<ElementFrom> element_list, ValidityIterator v)
475  : fixed_width_column_wrapper(std::cbegin(element_list), std::cend(element_list), v)
476  {
477  }
478 
497  template <typename InputIterator>
498  fixed_width_column_wrapper(InputIterator begin,
499  InputIterator end,
500  std::initializer_list<bool> const& validity)
501  : fixed_width_column_wrapper(begin, end, std::cbegin(validity))
502  {
503  }
504 
522  template <typename ElementFrom>
523  fixed_width_column_wrapper(std::initializer_list<std::pair<ElementFrom, bool>> elements)
524  {
525  auto begin =
526  thrust::make_transform_iterator(elements.begin(), [](auto const& e) { return e.first; });
527  auto end = begin + elements.size();
528  auto v =
529  thrust::make_transform_iterator(elements.begin(), [](auto const& e) { return e.second; });
531  }
532 };
533 
539 template <typename Rep>
541  public:
558  template <typename FixedPointRepIterator>
559  fixed_point_column_wrapper(FixedPointRepIterator begin,
560  FixedPointRepIterator end,
561  numeric::scale_type scale)
562  : column_wrapper{}
563  {
564  CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
565 
566  auto const size = cudf::distance(begin, end);
567  auto const elements = thrust::host_vector<Rep>(begin, end);
568  auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
569  auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
570 
571  wrapped.reset(new cudf::column{
572  data_type,
573  size,
574  rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::test::get_default_stream()},
575  rmm::device_buffer{},
576  0});
577  }
578 
591  fixed_point_column_wrapper(std::initializer_list<Rep> values, numeric::scale_type scale)
592  : fixed_point_column_wrapper(std::cbegin(values), std::cend(values), scale)
593  {
594  }
595 
623  template <typename FixedPointRepIterator, typename ValidityIterator>
624  fixed_point_column_wrapper(FixedPointRepIterator begin,
625  FixedPointRepIterator end,
626  ValidityIterator v,
627  numeric::scale_type scale)
628  : column_wrapper{}
629  {
630  CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
631 
632  auto const size = cudf::distance(begin, end);
633  auto const elements = thrust::host_vector<Rep>(begin, end);
634  auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
635  auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
636  auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
637  wrapped.reset(new cudf::column{
638  data_type,
639  size,
640  rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::test::get_default_stream()},
641  std::move(null_mask),
642  null_count});
643  }
644 
662  fixed_point_column_wrapper(std::initializer_list<Rep> elements,
663  std::initializer_list<bool> validity,
664  numeric::scale_type scale)
666  std::cbegin(elements), std::cend(elements), std::cbegin(validity), scale)
667  {
668  }
669 
688  template <typename ValidityIterator>
689  fixed_point_column_wrapper(std::initializer_list<Rep> element_list,
690  ValidityIterator v,
691  numeric::scale_type scale)
692  : fixed_point_column_wrapper(std::cbegin(element_list), std::cend(element_list), v, scale)
693  {
694  }
695 
716  template <typename FixedPointRepIterator>
717  fixed_point_column_wrapper(FixedPointRepIterator begin,
718  FixedPointRepIterator end,
719  std::initializer_list<bool> const& validity,
720  numeric::scale_type scale)
721  : fixed_point_column_wrapper(begin, end, std::cbegin(validity), scale)
722  {
723  }
724 };
725 
730  public:
734  strings_column_wrapper() : strings_column_wrapper(std::initializer_list<std::string>{}) {}
735 
756  template <typename StringsIterator>
757  strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
758  {
759  size_type num_strings = std::distance(begin, end);
760  if (num_strings == 0) {
762  return;
763  }
764  auto all_valid = thrust::make_constant_iterator(true);
765  auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid);
766  auto d_chars = cudf::detail::make_device_uvector_async(
767  chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
768  auto d_offsets = std::make_unique<cudf::column>(
769  cudf::detail::make_device_uvector_sync(
770  offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
771  rmm::device_buffer{},
772  0);
773  wrapped =
774  cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {});
775  }
776 
805  template <typename StringsIterator, typename ValidityIterator>
806  strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
807  : column_wrapper{}
808  {
809  size_type num_strings = std::distance(begin, end);
810  if (num_strings == 0) {
812  return;
813  }
814  auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
815  auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
816  auto d_chars = cudf::detail::make_device_uvector_async(
817  chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
818  auto d_offsets = std::make_unique<cudf::column>(
819  cudf::detail::make_device_uvector_async(
820  offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
821  rmm::device_buffer{},
822  0);
823  auto d_bitmask = cudf::detail::make_device_uvector_sync(
824  null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
825  wrapped = cudf::make_strings_column(
826  num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release());
827  }
828 
841  strings_column_wrapper(std::initializer_list<std::string> strings)
842  : strings_column_wrapper(std::cbegin(strings), std::cend(strings))
843  {
844  }
845 
864  template <typename ValidityIterator>
865  strings_column_wrapper(std::initializer_list<std::string> strings, ValidityIterator v)
866  : strings_column_wrapper(std::cbegin(strings), std::cend(strings), v)
867  {
868  }
869 
885  strings_column_wrapper(std::initializer_list<std::string> strings,
886  std::initializer_list<bool> validity)
887  : strings_column_wrapper(std::cbegin(strings), std::cend(strings), std::cbegin(validity))
888  {
889  }
890 
911  strings_column_wrapper(std::initializer_list<std::pair<std::string, bool>> strings)
912  {
913  auto begin =
914  thrust::make_transform_iterator(strings.begin(), [](auto const& s) { return s.first; });
915  auto end = begin + strings.size();
916  auto v =
917  thrust::make_transform_iterator(strings.begin(), [](auto const& s) { return s.second; });
918  wrapped = strings_column_wrapper(begin, end, v).release();
919  }
920 };
921 
930 template <typename KeyElementTo, typename SourceElementT = KeyElementTo>
932  public:
936  operator dictionary_column_view() const { return cudf::dictionary_column_view{wrapped->view()}; }
937 
941  dictionary_column_wrapper() : column_wrapper{}
942  {
944  }
945 
965  template <typename InputIterator>
966  dictionary_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
967  {
968  wrapped =
972  }
973 
999  template <typename InputIterator, typename ValidityIterator>
1000  dictionary_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
1001  : column_wrapper{}
1002  {
1003  wrapped = cudf::dictionary::encode(
1007  }
1008 
1022  template <typename ElementFrom>
1023  dictionary_column_wrapper(std::initializer_list<ElementFrom> elements)
1024  : dictionary_column_wrapper(std::cbegin(elements), std::cend(elements))
1025  {
1026  }
1027 
1046  template <typename ElementFrom>
1047  dictionary_column_wrapper(std::initializer_list<ElementFrom> elements,
1048  std::initializer_list<bool> validity)
1049  : dictionary_column_wrapper(std::cbegin(elements), std::cend(elements), std::cbegin(validity))
1050  {
1051  }
1052 
1071  template <typename ValidityIterator, typename ElementFrom>
1072  dictionary_column_wrapper(std::initializer_list<ElementFrom> element_list, ValidityIterator v)
1073  : dictionary_column_wrapper(std::cbegin(element_list), std::cend(element_list), v)
1074  {
1075  }
1076 
1097  template <typename InputIterator>
1098  dictionary_column_wrapper(InputIterator begin,
1099  InputIterator end,
1100  std::initializer_list<bool> const& validity)
1101  : dictionary_column_wrapper(begin, end, std::cbegin(validity))
1102  {
1103  }
1104 };
1105 
1111 template <>
1113  public:
1118  operator dictionary_column_view() const { return cudf::dictionary_column_view{wrapped->view()}; }
1119 
1125  column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); }
1126 
1132  column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); }
1133 
1137  dictionary_column_wrapper() : dictionary_column_wrapper(std::initializer_list<std::string>{}) {}
1138 
1159  template <typename StringsIterator>
1160  dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
1161  {
1162  wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end),
1165  }
1166 
1195  template <typename StringsIterator, typename ValidityIterator>
1196  dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
1197  : column_wrapper{}
1198  {
1199  wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v),
1202  }
1203 
1216  dictionary_column_wrapper(std::initializer_list<std::string> strings)
1217  : dictionary_column_wrapper(std::cbegin(strings), std::cend(strings))
1218  {
1219  }
1220 
1239  template <typename ValidityIterator>
1240  dictionary_column_wrapper(std::initializer_list<std::string> strings, ValidityIterator v)
1241  : dictionary_column_wrapper(std::cbegin(strings), std::cend(strings), v)
1242  {
1243  }
1244 
1260  dictionary_column_wrapper(std::initializer_list<std::string> strings,
1261  std::initializer_list<bool> validity)
1262  : dictionary_column_wrapper(std::cbegin(strings), std::cend(strings), std::cbegin(validity))
1263  {
1264  }
1265 };
1266 
1302 template <typename T, typename SourceElementT = T>
1304  public:
1308  operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; }
1309 
1323  template <typename Element = T, std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1324  lists_column_wrapper(std::initializer_list<SourceElementT> elements) : column_wrapper{}
1325  {
1326  build_from_non_nested(
1327  std::move(cudf::test::fixed_width_column_wrapper<T, SourceElementT>(elements).release()));
1328  }
1329 
1345  template <typename Element = T,
1346  typename InputIterator,
1347  std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1348  lists_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
1349  {
1350  build_from_non_nested(
1351  std::move(cudf::test::fixed_width_column_wrapper<T, SourceElementT>(begin, end).release()));
1352  }
1353 
1369  template <typename Element = T,
1370  typename ValidityIterator,
1371  std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1372  lists_column_wrapper(std::initializer_list<SourceElementT> elements, ValidityIterator v)
1373  : column_wrapper{}
1374  {
1375  build_from_non_nested(
1376  std::move(cudf::test::fixed_width_column_wrapper<T, SourceElementT>(elements, v).release()));
1377  }
1378 
1396  template <typename Element = T,
1397  typename InputIterator,
1398  typename ValidityIterator,
1399  std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1400  lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
1401  : column_wrapper{}
1402  {
1403  build_from_non_nested(std::move(
1405  }
1406 
1420  template <typename Element = T,
1421  std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
1422  lists_column_wrapper(std::initializer_list<std::string> elements) : column_wrapper{}
1423  {
1424  build_from_non_nested(
1425  std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release()));
1426  }
1427 
1443  template <typename Element = T,
1444  typename ValidityIterator,
1445  std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
1446  lists_column_wrapper(std::initializer_list<std::string> elements, ValidityIterator v)
1447  : column_wrapper{}
1448  {
1449  build_from_non_nested(
1450  std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release()));
1451  }
1452 
1475  : column_wrapper{}
1476  {
1477  std::vector<bool> valids;
1478  build_from_nested(elements, valids);
1479  }
1480 
1492  lists_column_wrapper() : column_wrapper{}
1493  {
1494  build_from_non_nested(make_empty_column(cudf::type_to_id<T>()));
1495  }
1496 
1522  template <typename ValidityIterator>
1524  ValidityIterator v)
1525  : column_wrapper{}
1526  {
1527  std::vector<bool> validity;
1528  std::transform(elements.begin(),
1529  elements.end(),
1530  v,
1531  std::back_inserter(validity),
1532  [](lists_column_wrapper const& l, bool valid) { return valid; });
1533  build_from_nested(elements, validity);
1534  }
1535 
1543  {
1546  return lists_column_wrapper<T>(
1547  1,
1548  offsets.release(),
1549  values.release(),
1550  valid ? 0 : 1,
1551  valid ? rmm::device_buffer{} : cudf::create_null_mask(1, cudf::mask_state::ALL_NULL));
1552  }
1553 
1554  private:
1565  std::unique_ptr<cudf::column>&& offsets,
1566  std::unique_ptr<cudf::column>&& values,
1568  rmm::device_buffer&& null_mask)
1569  {
1570  // construct the list column
1571  wrapped = make_lists_column(num_rows,
1572  std::move(offsets),
1573  std::move(values),
1574  null_count,
1575  std::move(null_mask),
1577  }
1578 
1595  void build_from_nested(std::initializer_list<lists_column_wrapper<T, SourceElementT>> elements,
1596  std::vector<bool> const& v)
1597  {
1598  auto valids = cudf::detail::make_counting_transform_iterator(
1599  0, [&v](auto i) { return v.empty() ? true : v[i]; });
1600 
1601  // compute the expected hierarchy and depth
1602  auto const hierarchy_and_depth =
1603  std::accumulate(elements.begin(),
1604  elements.end(),
1605  std::pair<column_view, int32_t>{{}, -1},
1606  [](auto acc, lists_column_wrapper const& lcw) {
1607  return lcw.depth > acc.second ? std::pair(lcw.get_view(), lcw.depth) : acc;
1608  });
1609  column_view expected_hierarchy = hierarchy_and_depth.first;
1610  int32_t const expected_depth = hierarchy_and_depth.second;
1611 
1612  // preprocess columns so that every column_view in 'cols' is an equivalent hierarchy
1613  auto [cols, stubs] = preprocess_columns(elements, expected_hierarchy, expected_depth);
1614 
1615  // generate offsets
1616  size_type count = 0;
1617  std::vector<size_type> offsetv;
1618  std::transform(cols.cbegin(),
1619  cols.cend(),
1620  valids,
1621  std::back_inserter(offsetv),
1622  [&](cudf::column_view const& col, bool valid) {
1623  // nulls are represented as a repeated offset
1624  size_type ret = count;
1625  if (valid) { count += col.size(); }
1626  return ret;
1627  });
1628  // add the final offset
1629  offsetv.push_back(count);
1630  auto offsets =
1631  cudf::test::fixed_width_column_wrapper<size_type>(offsetv.begin(), offsetv.end()).release();
1632 
1633  // concatenate them together, skipping children that are null.
1634  std::vector<column_view> children;
1635  thrust::copy_if(
1636  std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{});
1637 
1638  auto data = children.empty() ? cudf::empty_like(expected_hierarchy)
1639  : cudf::concatenate(children,
1640  cudf::test::get_default_stream(),
1641  rmm::mr::get_current_device_resource());
1642 
1643  // increment depth
1644  depth = expected_depth + 1;
1645 
1646  auto [null_mask, null_count] = [&] {
1647  if (v.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
1648  return cudf::test::detail::make_null_mask(v.begin(), v.end());
1649  }();
1650 
1651  // construct the list column
1652  wrapped = make_lists_column(cols.size(),
1653  std::move(offsets),
1654  std::move(data),
1655  null_count,
1656  std::move(null_mask),
1658  }
1659 
1667  void build_from_non_nested(std::unique_ptr<column> c)
1668  {
1669  CUDF_EXPECTS(c->type().id() == type_id::EMPTY || !cudf::is_nested(c->type()),
1670  "Unexpected type");
1671 
1672  std::vector<size_type> offsetv;
1673  if (c->size() > 0) {
1674  offsetv.push_back(0);
1675  offsetv.push_back(c->size());
1676  }
1677  auto offsets =
1678  cudf::test::fixed_width_column_wrapper<size_type>(offsetv.begin(), offsetv.end()).release();
1679 
1680  // construct the list column. mark this as a root
1681  root = true;
1682  depth = 0;
1683 
1684  size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1;
1685  wrapped = make_lists_column(num_elements,
1686  std::move(offsets),
1687  std::move(c),
1688  0,
1689  rmm::device_buffer{},
1691  }
1692 
1727  std::unique_ptr<column> normalize_column(column_view const& col,
1728  column_view const& expected_hierarchy)
1729  {
1730  // if are at the bottom of the short column, it must be empty
1731  if (col.type().id() != type_id::LIST) {
1732  CUDF_EXPECTS(col.is_empty(), "Encountered mismatched column!");
1733 
1734  auto remainder = empty_like(expected_hierarchy);
1735  return remainder;
1736  }
1737 
1738  lists_column_view lcv(col);
1739  return make_lists_column(
1740  col.size(),
1741  std::make_unique<column>(lcv.offsets()),
1742  normalize_column(lists_column_view(col).child(),
1743  lists_column_view(expected_hierarchy).child()),
1744  col.null_count(),
1746  col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
1748  }
1749 
1750  std::pair<std::vector<column_view>, std::vector<std::unique_ptr<column>>> preprocess_columns(
1751  std::initializer_list<lists_column_wrapper<T, SourceElementT>> const& elements,
1752  column_view& expected_hierarchy,
1753  int expected_depth)
1754  {
1755  std::vector<std::unique_ptr<column>> stubs;
1756  std::vector<column_view> cols;
1757 
1758  // preprocess the incoming lists.
1759  // - unwrap any "root" lists
1760  // - handle incomplete hierarchies
1761  std::transform(elements.begin(),
1762  elements.end(),
1763  std::back_inserter(cols),
1764  [&](lists_column_wrapper const& l) -> column_view {
1765  // depth mismatch. attempt to normalize the short column.
1766  // this function will also catch if this is a legitimately broken
1767  // set of input
1768  if (l.depth < expected_depth) {
1769  if (l.root) {
1770  // this exception distinguishes between the following two cases:
1771  //
1772  // { {{{1, 2, 3}}}, {} }
1773  // In this case, row 0 is a List<List<List<int>>>, whereas row 1 is
1774  // just a List<> which is an apparent mismatch. However, because row 1
1775  // is empty we will allow that to semantically mean
1776  // "a List<List<List<int>>> that's empty at the top level"
1777  //
1778  // { {{{1, 2, 3}}}, {4, 5, 6} }
1779  // In this case, row 1 is a concrete List<int> with actual values.
1780  // There is no way to rectify the differences so we will treat it as a
1781  // true column mismatch.
1782  CUDF_EXPECTS(l.wrapped->size() == 0, "Mismatch in column types!");
1783  stubs.push_back(empty_like(expected_hierarchy));
1784  } else {
1785  stubs.push_back(normalize_column(l.get_view(), expected_hierarchy));
1786  }
1787  return *(stubs.back());
1788  }
1789  // the empty hierarchy case
1790  return l.get_view();
1791  });
1792 
1793  return {std::move(cols), std::move(stubs)};
1794  }
1795 
1796  column_view get_view() const { return root ? lists_column_view(*wrapped).child() : *wrapped; }
1797 
1798  int depth = 0;
1799  bool root = false;
1800 };
1801 
1806  public:
1834  structs_column_wrapper(std::vector<std::unique_ptr<cudf::column>>&& child_columns,
1835  std::vector<bool> const& validity = {})
1836  {
1837  init(std::move(child_columns), validity);
1838  }
1839 
1861  std::initializer_list<std::reference_wrapper<detail::column_wrapper>> child_column_wrappers,
1862  std::vector<bool> const& validity = {})
1863  {
1864  std::vector<std::unique_ptr<cudf::column>> child_columns;
1865  child_columns.reserve(child_column_wrappers.size());
1866  std::transform(child_column_wrappers.begin(),
1867  child_column_wrappers.end(),
1868  std::back_inserter(child_columns),
1869  [&](auto const& column_wrapper) {
1870  return std::make_unique<cudf::column>(column_wrapper.get(),
1871  cudf::test::get_default_stream());
1872  });
1873  init(std::move(child_columns), validity);
1874  }
1875 
1896  template <typename V>
1898  std::initializer_list<std::reference_wrapper<detail::column_wrapper>> child_column_wrappers,
1899  V validity_iter)
1900  {
1901  std::vector<std::unique_ptr<cudf::column>> child_columns;
1902  child_columns.reserve(child_column_wrappers.size());
1903  std::transform(child_column_wrappers.begin(),
1904  child_column_wrappers.end(),
1905  std::back_inserter(child_columns),
1906  [&](auto const& column_wrapper) {
1907  return std::make_unique<cudf::column>(column_wrapper.get(),
1908  cudf::test::get_default_stream());
1909  });
1910  init(std::move(child_columns), validity_iter);
1911  }
1912 
1913  private:
1914  void init(std::vector<std::unique_ptr<cudf::column>>&& child_columns,
1915  std::vector<bool> const& validity)
1916  {
1917  size_type num_rows = child_columns.empty() ? 0 : child_columns[0]->size();
1918 
1919  CUDF_EXPECTS(std::all_of(child_columns.begin(),
1920  child_columns.end(),
1921  [&](auto const& p_column) { return p_column->size() == num_rows; }),
1922  "All struct member columns must have the same row count.");
1923 
1924  CUDF_EXPECTS(validity.size() <= 0 || static_cast<size_type>(validity.size()) == num_rows,
1925  "Validity buffer must have as many elements as rows in the struct column.");
1926 
1927  auto [null_mask, null_count] = [&] {
1928  if (validity.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
1929  return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
1930  }();
1931 
1932  wrapped = cudf::make_structs_column(num_rows,
1933  std::move(child_columns),
1934  null_count,
1935  std::move(null_mask),
1937  }
1938 
1939  template <typename V>
1940  void init(std::vector<std::unique_ptr<cudf::column>>&& child_columns, V validity_iterator)
1941  {
1942  size_type const num_rows = child_columns.empty() ? 0 : child_columns[0]->size();
1943 
1944  CUDF_EXPECTS(std::all_of(child_columns.begin(),
1945  child_columns.end(),
1946  [&](auto const& p_column) { return p_column->size() == num_rows; }),
1947  "All struct member columns must have the same row count.");
1948 
1949  std::vector<bool> validity(num_rows);
1950  std::copy(validity_iterator, validity_iterator + num_rows, validity.begin());
1951 
1952  init(std::move(child_columns), validity);
1953  }
1954 };
1955 
1956 } // namespace test
1957 } // namespace cudf
Utilities for bit and bitmask operations.
A non-owning, immutable view of device data as a column of elements, some of which may be null as ind...
A container of nullable device data as a column of elements.
Definition: column.hpp:47
Indicator for the logical data type of an element in a column.
Definition: types.hpp:241
A wrapper class for operations on a dictionary column.
column_view indices() const noexcept
Returns the column of indices.
column_view keys() const noexcept
Returns the column of keys.
Given a column-view of lists type, an instance of this class provides a wrapper on this compound colu...
A non-owning, mutable view of device data as a column of elements, some of which may be null as indic...
Base class for a wrapper around a cudf::column.
std::unique_ptr< cudf::column > wrapped
The wrapped column.
std::unique_ptr< cudf::column > release()
Releases internal unique_ptr to wrapped column.
dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
Construct a nullable dictionary column of strings from the range [begin,end) using the range [v,...
dictionary_column_wrapper(std::initializer_list< std::string > strings, std::initializer_list< bool > validity)
Construct a nullable dictionary column of strings from a list of strings and a list of booleans to in...
column_view indices() const
Access indices column view.
dictionary_column_wrapper(std::initializer_list< std::string > strings)
Construct a non-nullable dictionary column of strings from a list of strings.
dictionary_column_wrapper(StringsIterator begin, StringsIterator end)
Construct a non-nullable dictionary column of strings from the range [begin,end).
dictionary_column_wrapper(std::initializer_list< std::string > strings, ValidityIterator v)
Construct a nullable dictionary column of strings from a list of strings and the range [v,...
column_view keys() const
Access keys column view.
dictionary_column_wrapper()
Default constructor initializes an empty dictionary column of strings.
column_wrapper derived class for wrapping dictionary columns.
dictionary_column_wrapper(std::initializer_list< ElementFrom > elements)
Construct a non-nullable dictionary column of fixed-width elements from an initializer list.
dictionary_column_wrapper(std::initializer_list< ElementFrom > elements, std::initializer_list< bool > validity)
Construct a nullable dictionary column from a list of fixed-width elements using another list to indi...
dictionary_column_wrapper(std::initializer_list< ElementFrom > element_list, ValidityIterator v)
Construct a nullable dictionary column from a list of fixed-width elements and the range [v,...
dictionary_column_wrapper()
Default constructor initializes an empty column with dictionary type.
dictionary_column_wrapper(InputIterator begin, InputIterator end)
Construct a non-nullable dictionary column of the fixed-width elements in the range [begin,...
dictionary_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
Construct a nullable dictionary column of the fixed-width elements in the range [begin,...
dictionary_column_wrapper(InputIterator begin, InputIterator end, std::initializer_list< bool > const &validity)
Construct a nullable dictionary column of the fixed-width elements in the range [begin,...
A wrapper for a column of fixed-width elements.
fixed_point_column_wrapper(FixedPointRepIterator begin, FixedPointRepIterator end, std::initializer_list< bool > const &validity, numeric::scale_type scale)
Construct a nullable column of the decimal elements in the range [begin,end) using a validity initial...
fixed_point_column_wrapper(std::initializer_list< Rep > elements, std::initializer_list< bool > validity, numeric::scale_type scale)
Construct a nullable column from an initializer list of decimal elements using another list to indica...
fixed_point_column_wrapper(FixedPointRepIterator begin, FixedPointRepIterator end, numeric::scale_type scale)
Construct a non-nullable column of the decimal elements in the range [begin,end).
fixed_point_column_wrapper(std::initializer_list< Rep > element_list, ValidityIterator v, numeric::scale_type scale)
Construct a nullable column from an initializer list of decimal elements and the range [v,...
fixed_point_column_wrapper(std::initializer_list< Rep > values, numeric::scale_type scale)
Construct a non-nullable column of decimal elements from an initializer list.
fixed_point_column_wrapper(FixedPointRepIterator begin, FixedPointRepIterator end, ValidityIterator v, numeric::scale_type scale)
Construct a nullable column of the fixed-point elements from a range.
column_wrapper derived class for wrapping columns of fixed-width elements.
fixed_width_column_wrapper(InputIterator begin, InputIterator end, std::initializer_list< bool > const &validity)
Construct a nullable column of the fixed-width elements in the range [begin,end) using a validity ini...
fixed_width_column_wrapper(std::initializer_list< std::pair< ElementFrom, bool >> elements)
Construct a nullable column from a list of pairs of fixed-width elements and validity booleans of eac...
fixed_width_column_wrapper(std::initializer_list< ElementFrom > elements, std::initializer_list< bool > validity)
Construct a nullable column from a list of fixed-width elements using another list to indicate the va...
fixed_width_column_wrapper(std::initializer_list< ElementFrom > elements)
Construct a non-nullable column of fixed-width elements from an initializer list.
fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
Construct a nullable column of the fixed-width elements in the range [begin,end) using the range [v,...
fixed_width_column_wrapper(std::initializer_list< ElementFrom > element_list, ValidityIterator v)
Construct a nullable column from a list of fixed-width elements and the range [v, v + element_list....
fixed_width_column_wrapper(InputIterator begin, InputIterator end)
Construct a non-nullable column of the fixed-width elements in the range [begin,end).
fixed_width_column_wrapper()
Default constructor initializes an empty column with proper dtype.
column_wrapper derived class for wrapping columns of lists.
lists_column_wrapper(InputIterator begin, InputIterator end)
Construct a lists column containing a single list of fixed-width type from an iterator range.
lists_column_wrapper(std::initializer_list< lists_column_wrapper< T, SourceElementT >> elements, ValidityIterator v)
Construct a lists column of nested lists from an initializer list of values and a validity iterator.
static lists_column_wrapper< T > make_one_empty_row_column(bool valid=true)
Construct a list column containing a single empty, optionally null row.
lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
Construct a lists column containing a single list of fixed-width type from an iterator range and a va...
lists_column_wrapper()
Construct an empty lists column.
lists_column_wrapper(std::initializer_list< lists_column_wrapper< T, SourceElementT >> elements)
Construct a lists column of nested lists from an initializer list of values.
lists_column_wrapper(std::initializer_list< SourceElementT > elements)
Construct a lists column containing a single list of fixed-width type from an initializer list of val...
lists_column_wrapper(std::initializer_list< std::string > elements, ValidityIterator v)
Construct a lists column containing a single list of strings from an initializer list of values and a...
lists_column_wrapper(std::initializer_list< std::string > elements)
Construct a lists column containing a single list of strings from an initializer list of values.
lists_column_wrapper(std::initializer_list< SourceElementT > elements, ValidityIterator v)
Construct a lists column containing a single list of fixed-width type from an initializer list of val...
column_wrapper derived class for wrapping columns of strings.
strings_column_wrapper(std::initializer_list< std::pair< std::string, bool >> strings)
Construct a nullable column from a list of pairs of strings and validity booleans of each string.
strings_column_wrapper()
Default constructor initializes an empty column of strings.
strings_column_wrapper(std::initializer_list< std::string > strings)
Construct a non-nullable column of strings from a list of strings.
strings_column_wrapper(std::initializer_list< std::string > strings, std::initializer_list< bool > validity)
Construct a nullable column of strings from a list of strings and a list of booleans to indicate the ...
strings_column_wrapper(std::initializer_list< std::string > strings, ValidityIterator v)
Construct a nullable column of strings from a list of strings and the range [v, v + strings....
strings_column_wrapper(StringsIterator begin, StringsIterator end)
Construct a non-nullable column of strings from the range [begin,end).
strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
Construct a nullable column of strings from the range [begin,end) using the range [v,...
column_wrapper derived class for wrapping columns of structs.
structs_column_wrapper(std::initializer_list< std::reference_wrapper< detail::column_wrapper >> child_column_wrappers, V validity_iter)
Constructs a struct column from the list of column wrappers for child columns.
structs_column_wrapper(std::initializer_list< std::reference_wrapper< detail::column_wrapper >> child_column_wrappers, std::vector< bool > const &validity={})
Constructs a struct column from the list of column wrappers for child columns.
structs_column_wrapper(std::vector< std::unique_ptr< cudf::column >> &&child_columns, std::vector< bool > const &validity={})
Constructs a struct column from the specified list of pre-constructed child columns.
Class definition for cudf::column.
Column factory APIs.
Column APIs for gather, scatter, split, slice, etc.
Dictionary column encode and decode APIs.
Class definition for fixed point data type.
std::unique_ptr< column > empty_like(column_view const &input)
Initializes and returns an empty column of the same type as the input.
std::unique_ptr< column > make_strings_column(cudf::device_span< thrust::pair< char const *, size_type > const > strings, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Construct a STRING type column given a device span of pointer/size pairs.
std::unique_ptr< cudf::column > make_lists_column(size_type num_rows, std::unique_ptr< column > offsets_column, std::unique_ptr< column > child_column, size_type null_count, rmm::device_buffer &&null_mask, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Construct a LIST type column given offsets column, child column, null mask and null count.
std::unique_ptr< column > make_empty_column(data_type type)
Creates an empty column of the specified type.
std::unique_ptr< cudf::column > make_structs_column(size_type num_rows, std::vector< std::unique_ptr< column >> &&child_columns, size_type null_count, rmm::device_buffer &&null_mask, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Construct a STRUCT column using specified child columns as members.
cudf::size_type null_count(bitmask_type const *bitmask, size_type start, size_type stop, rmm::cuda_stream_view stream=cudf::get_default_stream())
Given a validity bitmask, counts the number of null elements (unset bits) in the range [start,...
std::size_t bitmask_allocation_size_bytes(size_type number_of_bits, std::size_t padding_boundary=64)
Computes the required bytes necessary to represent the specified number of bits with a given padding ...
rmm::device_buffer copy_bitmask(bitmask_type const *mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Creates a device_buffer from a slice of bitmask defined by a range of indices [begin_bit,...
rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Creates a device_buffer for use as a null value indicator bitmask of a column.
std::unique_ptr< column > concatenate(host_span< column_view const > columns_to_concat, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Concatenates multiple columns into a single column.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
std::unique_ptr< column > encode(column_view const &column, data_type indices_type=data_type{type_id::UINT32}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Construct a dictionary column by dictionary encoding an existing column.
scale_type
The scale type for fixed_point.
Definition: fixed_point.hpp:43
std::unique_ptr< column > transform(column_view const &input, std::string const &unary_udf, data_type output_type, bool is_ptx, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Creates a new column by applying a unary function against every element of an input column.
CUDF_HOST_DEVICE void set_bit_unsafe(bitmask_type *bitmask, size_type bit_index)
Sets the specified bit to 1
Definition: bit.hpp:99
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:177
constexpr bool is_fixed_point()
Indicates whether the type T is a fixed-point type.
Definition: traits.hpp:397
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
uint32_t bitmask_type
Bitmask type stored as 32-bit unsigned integer.
Definition: types.hpp:94
size_type distance(T f, T l)
Similar to std::distance but returns cudf::size_type and performs static_cast
Definition: types.hpp:108
constexpr bool is_nested()
Indicates whether T is a nested type.
Definition: traits.hpp:577
@ ALL_NULL
Null mask allocated, initialized to all elements NULL.
@ UINT32
4 byte unsigned integer
@ LIST
List elements.
@ STRING
String elements.
@ EMPTY
Always null with no underlying data.
@ DICTIONARY32
Dictionary type using int32 indices.
Class definition for cudf::lists_column_view.
cuDF interfaces
Definition: aggregation.hpp:34
fixed_point and supporting types
Definition: fixed_point.hpp:33
APIs for managing validity bitmasks.
Convert between source and target types when they differ and where possible.
constexpr ToT operator()(FromT element) const
No conversion necessary: Same type, simply copy element to output.
Defines the mapping between cudf::type_id runtime type information and concrete C++ types.
Type declarations for libcudf.