from fastai.data.all import *
from fastai.vision.all import *
# Download and extract the Oxford-IIIT Pets dataset (cached after the first run).
path = untar_data(URLs.PETS)
# Display paths relative to the dataset root when printing.
Path.BASE_PATH = path
path.ls()
# Recursively collect every image file path under images/.
fnames = get_image_files(path/"images")
# An empty DataBlock: no blocks, getters, or splitter configured yet.
dblock = DataBlock()
By itself, a DataBlock is just a blueprint for how to assemble your data. It does not do anything until you pass it a source. You can then choose to convert that source into a Datasets or a DataLoaders by using the DataBlock.datasets or DataBlock.dataloaders method. Since we haven't done anything to get our data ready for batches, the dataloaders method will fail here, but we can have a look at how it gets converted in Datasets. This is where we pass the source of our data, here all our filenames.
# Materialize the blueprint by passing a source (here a list of filenames);
# this produces Datasets with a train/valid split.
dsets = dblock.datasets(fnames)
dsets.train[0]
dsets
By default, the data block API assumes we have an input and a target, which is why we see our filename repeated twice.
The first thing we can do is use a `get_items` function to actually assemble our items inside the data block.
# get_items tells the DataBlock how to collect items from the source,
# so we can now pass a directory instead of a ready-made list of files.
dblock = DataBlock(get_items = get_image_files)
get_image_files
dsets = dblock.datasets(path/"images")
dsets.valid[0]
def label_func(fname):
    """Label a pet image as "cat" or "dog" from its file name.

    In the Oxford-IIIT Pets dataset, cat-breed file names are capitalized
    while dog-breed file names are lowercase.
    """
    first_char = fname.name[0]
    return "cat" if first_char.isupper() else "dog"
# get_y maps each item (a filename) to its target; the item itself is still
# used as the input, which is why only the second element changes below.
dblock = DataBlock(get_items = get_image_files,
get_y = label_func)
dsets = dblock.datasets(path/"images")
dsets.train[0]
# blocks declares the types: the input is an image, the target a category.
# ImageBlock opens each file as an image; CategoryBlock builds a vocabulary
# from the labels and encodes them as integers.
dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
get_items = get_image_files,
get_y = label_func)
dsets = dblock.datasets(path/"images")
dsets.train[0]
# The category vocabulary inferred from the labels.
dsets.vocab
# splitter controls how items are divided into training and validation sets;
# RandomSplitter() holds out a random subset (20% by default).
dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
get_items = get_image_files,
get_y = label_func,
splitter = RandomSplitter())
dsets = dblock.datasets(path/"images")
dsets.train[0]
# item_tfms are applied to each item before batching; resizing to a fixed
# size makes the images collatable into batches, so dataloaders() now works.
dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
get_items = get_image_files,
get_y = label_func,
splitter = RandomSplitter(),
item_tfms = Resize(224))
dls = dblock.dataloaders(path/"images")
dls.show_batch()
The way we usually build the data block in one go is by answering a list of questions:
- what are the types of your inputs/targets? Here images and categories
- where is your data? Here in filenames in subfolders
- does something need to be applied to inputs? Here no
- does something need to be applied to the target? Here the label_func function
- how to split the data? Here randomly
- do we need to apply something on formed items? Here a resize
- do we need to apply something on formed batches? Here no
# MNIST: open images as grayscale (PILImageBW), label each file by its
# parent folder name, and split by grandparent folder (train/ vs valid/).
mnist = DataBlock(blocks=(ImageBlock(cls=PILImageBW), CategoryBlock),
get_items=get_image_files,
splitter=GrandparentSplitter(),
get_y=parent_label)
dls = mnist.dataloaders(untar_data(URLs.MNIST_TINY))
dls.show_batch(max_n=9, figsize=(4,4))
# Pets: label each image through a Pipeline that takes the Path's name and
# applies a regex to capture the breed portion of the filename.
# FIX: the dot before "jpg" is escaped (\.) so it matches a literal '.';
# the unescaped form matched any character (e.g. "name_1Xjpg" would pass).
pets = DataBlock(blocks=(ImageBlock, CategoryBlock),
                 get_items=get_image_files,
                 splitter=RandomSplitter(),
                 get_y=Pipeline([attrgetter("name"), RegexLabeller(pat = r'^(.*)_\d+\.jpg$')]),
                 item_tfms=Resize(128),
                 batch_tfms=aug_transforms())
dls = pets.dataloaders(untar_data(URLs.PETS)/"images")
dls.show_batch(max_n=9)
# PASCAL 2007 multi-label data: a CSV mapping filenames to space-separated
# label lists (plus a split column — inspect df.head() to confirm).
pascal_source = untar_data(URLs.PASCAL_2007)
df = pd.read_csv(pascal_source/"train.csv")
df.head(5)
# The source is a DataFrame, so no get_items is needed: get_x reads column 0
# (filename) prefixed with the train/ folder, and get_y reads column 1,
# splitting on spaces into multiple labels (MultiCategoryBlock).
# ColSplitter() splits rows using the DataFrame's validation column.
pascal = DataBlock(blocks=(ImageBlock, MultiCategoryBlock),
splitter=ColSplitter(),
get_x=ColReader(0, pref=pascal_source/"train"),
get_y=ColReader(1, label_delim=' '),
item_tfms=Resize(224),
batch_tfms=aug_transforms())
dls = pascal.dataloaders(df)
dls.show_batch()
# The same pipeline written with plain lambdas instead of ColReader.
# NOTE(review): lambdas cannot be pickled, so a DataBlock built this way
# cannot be exported — the ColReader version is preferable in practice.
pascal = DataBlock(blocks=(ImageBlock, MultiCategoryBlock),
splitter=ColSplitter(),
get_x=lambda x:pascal_source/"train"/f'{x[0]}',
get_y=lambda x:x[1].split(' '),
item_tfms=Resize(224),
batch_tfms=aug_transforms())
dls = pascal.dataloaders(df)
dls.show_batch()
There are various problems that fall in the image localization category: image segmentation (a task where you have to predict the class of each pixel of an image), coordinate prediction (predict one or several key points on an image), and object detection (draw a box around objects to detect).
# CamVid segmentation: the target is a mask image (one class per pixel).
path = untar_data(URLs.CAMVID_TINY)
path.ls()
# MaskBlock is given the class names loaded from codes.txt; get_y maps an
# image path to its mask file at labels/<stem>_P<suffix>.
camvid = DataBlock(blocks=(ImageBlock, MaskBlock(codes = np.loadtxt(path/'codes.txt', dtype=str))),
get_items=get_image_files,
splitter=RandomSplitter(),
get_y=lambda o: path/'labels'/f'{o.stem}_P{o.suffix}',
batch_tfms=aug_transforms())
dls = camvid.dataloaders(path/"images")
dls.show_batch()
# BIWI head pose: the target is a single point per image, looked up by
# filename in a precomputed pickle of centers.
biwi_source = untar_data(URLs.BIWI_SAMPLE)
fn2ctr = load_pickle(biwi_source/'centers.pkl')
# flip(0) reverses the stored coordinate pair — presumably to match the
# coordinate order PointBlock expects; confirm against centers.pkl.
biwi = DataBlock(blocks=(ImageBlock, PointBlock),
get_items=get_image_files,
splitter=RandomSplitter(),
get_y=lambda o:fn2ctr[o.name].flip(0),
batch_tfms=aug_transforms())
dls = biwi.dataloaders(biwi_source)
dls.show_batch(max_n=9)
# COCO object detection: the original cell repeated the three setup lines
# verbatim (duplicate download/parse); the redundant copy is removed.
coco_source = untar_data(URLs.COCO_TINY)
images, lbl_bbox = get_annotations(coco_source/'train.json')
# Map each image file name to its (bounding boxes, labels) annotation pair.
img2bbox = dict(zip(images, lbl_bbox))
# Three blocks: image input plus two targets (boxes and box labels).
# n_inp=1 marks only the first block as input, so get_y must supply one
# function per remaining block.
coco = DataBlock(blocks=(ImageBlock, BBoxBlock, BBoxLblBlock),
                 get_items=get_image_files,
                 splitter=RandomSplitter(),
                 get_y=[lambda o: img2bbox[o.name][0], lambda o: img2bbox[o.name][1]],
                 item_tfms=Resize(128),
                 batch_tfms=aug_transforms(),
                 n_inp=1)
dls = coco.dataloaders(coco_source)
dls.show_batch(max_n=9)
from fastai.text.all import *
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head()
# Language-model data: with is_lm=True the targets are derived from the
# inputs themselves (next-token prediction), so no get_y is needed.
imdb_lm = DataBlock(blocks=TextBlock.from_df('text', is_lm=True),
get_x=ColReader('text'),
splitter=ColSplitter())
dls = imdb_lm.dataloaders(df, bs=64, seq_len=72)
dls.show_batch(max_n=6)
from fastai.tabular.core import *
adult_source = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult_source/'adult.csv')
df.head()
# Declare which columns are categorical and which are continuous.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps applied in order: Categorify encodes categorical
# columns, FillMissing imputes, Normalize standardizes continuous columns.
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df))
# Tabular data bypasses the DataBlock API: TabularPandas assembles the data
# directly, with the target declared via y_names and typed via y_block.
to = TabularPandas(df, procs, cat_names, cont_names, y_names="salary", splits=splits, y_block=CategoryBlock)
dls = to.dataloaders()
dls.show_batch()